Add logger.

Sebastian Lenzlinger 2023-12-03 12:28:32 +01:00
parent e73962d8e1
commit ca3450a4de
3 changed files with 28 additions and 22 deletions

data_utils.py

@@ -1,11 +1,13 @@
 # data_utils.py
 import os
 import pandas as pd
 import requests
 from urllib.parse import urlparse
 import geopandas as gpd
 from concurrent.futures import ThreadPoolExecutor as tpe
+import logging
+logging.basicConfig(level=logging.DEBUG, filename='data_utils.log', format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger('data_utils.py')
 
 def download_csv(url, local_filename):
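The three added lines route every logger call in this module to data_utils.log. A minimal sketch of how this configuration behaves (file and logger names are taken from the diff; the message is illustrative):

import logging

logging.basicConfig(level=logging.DEBUG, filename='data_utils.log',
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('data_utils.py')

logger.debug("Downloading https://example.com/data.csv ...")
# appends to data_utils.log a line like:
# 2023-12-03 12:28:32,123 - data_utils.py - DEBUG - Downloading https://example.com/data.csv ...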
@@ -33,9 +35,9 @@ def process_urls(data_dir, urls_file):
         # Check if the file already exists
         if not os.path.isfile(local_filename):
-            print(f"Downloading {url}...")
+            logger.debug(f"Downloading {url}...")
             download_csv(url, local_filename)
-            print(f"Saved to {local_filename}")
+            logger.debug(f"Saved to {local_filename}")
         else:
             print(f"File {filename} already exists in {data_dir}, skipping download.")
@@ -45,7 +47,7 @@ def load_dataframe_from_csv(filepath):
         df = pd.read_csv(filepath, low_memory=False)
         return df
     except Exception as e:
-        print(f"Error loading {filepath}: {e}")
+        logger.error(f"Error loading {filepath}: {e}")
         return None
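Inside an except block, logger.exception is a close relative of the logger.error call added here: it also logs at ERROR level but appends the full traceback. A minimal sketch (a variant, not what the commit does):

import logging

logging.basicConfig(level=logging.DEBUG, filename='data_utils.log')
logger = logging.getLogger('data_utils.py')

try:
    raise ValueError("bad CSV")                  # stand-in for pd.read_csv failing
except Exception:
    logger.exception("Error loading some.csv")   # ERROR entry plus traceback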
@@ -75,11 +77,11 @@ def load_dataframes_from_geojson_files(data_dir, u_string):
     print('u_string', u_string)
     gdf = gpd.GeoDataFrame()
     for filename in os.listdir(data_dir):
-        print("Filename:", filename)
+        #print("Filename:", filename)
         if (u_string in filename) and filename.endswith('.json'):
             filepath = os.path.join(data_dir, filename)
             print("Filepath:", filepath)
-            gdf = gpd.read_file(filepath)  # Read GeoJSON directly as GeoDataFrame
+            gdf = gpd.read_file(filepath)
     return gdf
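Unrelated to the logging change: gdf is reassigned on each matching file, so this function returns only the last GeoJSON it reads. If merging all matches is the intent, one way to do it (a hypothetical variant, not part of this commit):

import os
import pandas as pd
import geopandas as gpd

def load_all_geojson(data_dir, u_string):
    frames = [gpd.read_file(os.path.join(data_dir, f))
              for f in sorted(os.listdir(data_dir))
              if u_string in f and f.endswith('.json')]
    # One GeoDataFrame with all matching features; empty if nothing matched.
    return gpd.GeoDataFrame(pd.concat(frames, ignore_index=True)) if frames else gpd.GeoDataFrame()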
@@ -90,7 +92,7 @@ def combine_dataframes(dataframes):
         return combined_dataframe
     else:
         print("No dataframes to combine")
-        return pd.DataFrame()  # Return an empty DataFrame
+        return pd.DataFrame()
 
 def create_unified_df(urls_file, u_string, data_dir, files_present=False):
@@ -110,7 +112,6 @@ def save_dataframe_to_csv(df, integrated_dir, filename):
 if __name__ == "__main__":
-    # Test the functions here if necessary
     csv_urls_file = '../docs/all_csv_urls.txt'
     datasets_dir = 'datasets/'
     output_file = 'column_names.txt'

integrate.py

@@ -4,6 +4,12 @@ import os
 import requests
 import pandas as pd
+import logging
+
+logging.basicConfig(level=logging.DEBUG, filename='integrate.log',
+                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger('integrate.py')
+
 foot_bike_urls_file = '../docs/foot_bike_zaehlung_urls.txt'
 miv_file_urls = '../docs/verkehrszaehlung_moto_urls.txt'
 accident_file_url = '../docs/accident_loc_urls.txt'
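Worth noting: logging.basicConfig configures the root logger only if it has no handlers yet. Since this file also uses data_utils (as du below), and both modules now call basicConfig at import time, whichever call runs first wins and both loggers end up writing to the same file. A dedicated handler per module avoids that; a minimal sketch (a hypothetical alternative, not what the commit does):

import logging

def make_file_logger(name, path):
    # Attach a dedicated FileHandler instead of relying on basicConfig,
    # so each module keeps its own log file regardless of import order.
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)
    handler = logging.FileHandler(path)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(handler)
    return logger

logger = make_file_logger('integrate.py', 'integrate.log')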
@@ -41,7 +47,7 @@ def process_foot_bike_data():
 
 def process_miv_data():
-    miv_df_unified = du.create_unified_df(miv_file_urls, motor_file_u_string, data_dir,files_present=True)
+    miv_df_unified = du.create_unified_df(miv_file_urls, motor_file_u_string, data_dir, files_present=True)
     miv_df_unified[['Date', "Time"]] = miv_df_unified['MessungDatZeit'].str.split('T', expand=True)
     miv_df_unified[['Hrs', 'Mins', 'Sec']] = miv_df_unified['Time'].str.split(':', expand=True)
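For reference, the two context lines above split ISO 8601 timestamps such as '2023-12-03T11:01:14' into date and time components. A self-contained illustration with made-up values:

import pandas as pd

df = pd.DataFrame({'MessungDatZeit': ['2023-12-03T11:01:14', '2023-12-03T12:28:32']})
df[['Date', 'Time']] = df['MessungDatZeit'].str.split('T', expand=True)
df[['Hrs', 'Mins', 'Sec']] = df['Time'].str.split(':', expand=True)
print(df[['Date', 'Hrs', 'Mins', 'Sec']])
#          Date Hrs Mins Sec
# 0  2023-12-03  11   01  14
# 1  2023-12-03  12   28  32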
@@ -65,13 +71,12 @@ def process_accident_data():
     acc_cols_to_keep = ['AccidentUID', 'AccidentHour', 'AccidentYear', 'AccidentWeekDay_en', 'AccidentType',
                         'AccidentSeverityCategory', 'AccidentInvolvingPedestrian', 'AccidentInvolvingBicycle',
                         'AccidentInvolvingMotorcycle', 'RoadType', 'RoadType_en', 'AccidentLocation_CHLV95_E',
-                        'AccidentLocation_CHLV95_N', 'geometry', 'AccidentMonth']
+                        'AccidentLocation_CHLV95_N', 'AccidentMonth', 'geometry']
     cleaned_acc_df = acc_df_unified[acc_cols_to_keep]
     return cleaned_acc_df
 
 if __name__ == '__main__':
-    fb_df = process_miv_data()
-    print(fb_df['MessungDatZeit'])
-    print(fb_df.dtypes)
-    print(fb_df.head(100))
+    acc_df = process_accident_data()
+    print(acc_df.dtypes)
+    print(acc_df.head(100))
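Selecting with acc_cols_to_keep returns the columns in the listed order, so the column edit simply moves geometry to the end; a GeoDataFrame tracks its active geometry column by name, not by position. A toy example (hypothetical data; Swiss LV95 coordinates to match the CHLV95 columns):

import geopandas as gpd
from shapely.geometry import Point

gdf = gpd.GeoDataFrame({'AccidentUID': ['a1'], 'AccidentMonth': [12]},
                       geometry=[Point(2683000, 1247000)], crs='EPSG:2056')
out = gdf[['AccidentUID', 'AccidentMonth', 'geometry']]  # column order follows the list
print(type(out).__name__, list(out.columns))
# GeoDataFrame ['AccidentUID', 'AccidentMonth', 'geometry']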

Jupyter notebook (filename not shown in this view)

@@ -13,8 +13,8 @@
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2023-12-02T23:43:55.980827Z",
-     "start_time": "2023-12-02T23:43:55.546732Z"
+     "end_time": "2023-12-03T10:58:50.698090Z",
+     "start_time": "2023-12-03T10:58:50.384352Z"
     }
    },
    "id": "be55b25929d95559"
@@ -44,8 +44,8 @@
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2023-12-02T23:48:08.233784Z",
-     "start_time": "2023-12-02T23:43:55.980667Z"
+     "end_time": "2023-12-03T11:01:14.422749Z",
+     "start_time": "2023-12-03T10:58:52.300667Z"
     }
    },
    "id": "dd3831953afdeb72"
@@ -123,7 +123,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 6,
+    "execution_count": 3,
     "outputs": [
      {
       "name": "stdout",
@@ -163,8 +163,8 @@
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2023-12-02T23:49:50.185415Z",
-     "start_time": "2023-12-02T23:49:34.846049Z"
+     "end_time": "2023-12-03T11:15:51.051154Z",
+     "start_time": "2023-12-03T11:15:36.154717Z"
     }
    },
    "id": "f86bc612060b17a4"