Add logger.
This commit is contained in:
parent
e73962d8e1
commit
ca3450a4de
data_utils.py

@@ -1,11 +1,13 @@
-# data_utils.py
 
 import os
 import pandas as pd
 import requests
 from urllib.parse import urlparse
 import geopandas as gpd
 from concurrent.futures import ThreadPoolExecutor as tpe
+import logging
 
+logging.basicConfig(level=logging.DEBUG, filename='data_utils.log', format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger('data_utils.py')
 
 def download_csv(url, local_filename):
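For reference, the lines added at the top of data_utils.py follow the standard-library logging pattern: configure a file handler once via basicConfig, then fetch a named logger and call logger.debug / logger.error instead of print. A minimal self-contained sketch of that pattern (the helper function and URL below are made up for illustration; only the logging calls mirror the diff):

import logging

# Send DEBUG and above to a file, with the same record format as in the diff.
logging.basicConfig(
    level=logging.DEBUG,
    filename='data_utils.log',
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger('data_utils.py')


def fetch(url):
    # Hypothetical helper, only to show the module-level logger in use.
    logger.debug(f"Downloading {url}...")


fetch('https://example.com/data.csv')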
@@ -33,9 +35,9 @@ def process_urls(data_dir, urls_file):
 
         # Check if the file already exists
         if not os.path.isfile(local_filename):
-            print(f"Downloading {url}...")
+            logger.debug(f"Downloading {url}...")
             download_csv(url, local_filename)
-            print(f"Saved to {local_filename}")
+            logger.debug(f"Saved to {local_filename}")
         else:
             print(f"File {filename} already exists in {data_dir}, skipping download.")
 
@@ -45,7 +47,7 @@ def load_dataframe_from_csv(filepath):
         df = pd.read_csv(filepath, low_memory=False)
         return df
     except Exception as e:
-        print(f"Error loading {filepath}: {e}")
+        logger.error(f"Error loading {filepath}: {e}")
         return None
 
 
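With the except branch routed through the logger, the changed loader reads as a small standalone script; a sketch assuming the module-level logger configured as above (repeated here so the snippet runs on its own, with a hypothetical CSV path):

import logging

import pandas as pd

logging.basicConfig(level=logging.DEBUG, filename='data_utils.log',
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('data_utils.py')


def load_dataframe_from_csv(filepath):
    # Return the parsed DataFrame, or None if the file cannot be read.
    try:
        df = pd.read_csv(filepath, low_memory=False)
        return df
    except Exception as e:
        # logger.exception(...) would additionally record the traceback;
        # the commit uses logger.error, which logs only the message.
        logger.error(f"Error loading {filepath}: {e}")
        return None


df = load_dataframe_from_csv('datasets/example.csv')  # hypothetical path
print(df is None)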
@@ -75,11 +77,11 @@ def load_dataframes_from_geojson_files(data_dir, u_string):
     print('u_string', u_string)
     gdf = gpd.GeoDataFrame()
     for filename in os.listdir(data_dir):
-        print("Filename:", filename)
+        #print("Filename:", filename)
         if (u_string in filename) and filename.endswith('.json'):
             filepath = os.path.join(data_dir, filename)
             print("Filepath:", filepath)
-            gdf = gpd.read_file(filepath)  # Read GeoJSON directly as GeoDataFrame
+            gdf = gpd.read_file(filepath)
 
     return gdf
 
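Since gpd.read_file does all of the GeoJSON parsing above, a minimal sketch of that call in isolation (the file path is hypothetical; any GeoJSON file with a geometry member behaves the same way):

import geopandas as gpd

# read_file parses GeoJSON directly into a GeoDataFrame,
# so no manual json.load or geometry conversion is needed.
gdf = gpd.read_file('datasets/example_locations.json')  # hypothetical path
print(gdf.crs)
print(gdf.head())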
@@ -90,7 +92,7 @@ def combine_dataframes(dataframes):
         return combined_dataframe
     else:
         print("No dataframes to combine")
-        return pd.DataFrame()  # Return an empty DataFrame
+        return pd.DataFrame()
 
 
 def create_unified_df(urls_file, u_string, data_dir, files_present=False):
@@ -110,7 +112,6 @@ def save_dataframe_to_csv(df, integrated_dir, filename):
 
 
 if __name__ == "__main__":
-    # Test the functions here if necessary
     csv_urls_file = '../docs/all_csv_urls.txt'
     datasets_dir = 'datasets/'
     output_file = 'column_names.txt'

integrate.py

@@ -4,6 +4,12 @@ import os
 import requests
 import pandas as pd
 
+import logging
+
+logging.basicConfig(level=logging.DEBUG, filename='integrate.log',
+                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger('integrate.py')
+
 foot_bike_urls_file = '../docs/foot_bike_zaehlung_urls.txt'
 miv_file_urls = '../docs/verkehrszaehlung_moto_urls.txt'
 accident_file_url = '../docs/accident_loc_urls.txt'
@@ -41,7 +47,7 @@ def process_foot_bike_data():
 
 
 def process_miv_data():
-    miv_df_unified = du.create_unified_df(miv_file_urls, motor_file_u_string, data_dir,files_present=True)
+    miv_df_unified = du.create_unified_df(miv_file_urls, motor_file_u_string, data_dir, files_present=True)
 
     miv_df_unified[['Date', "Time"]] = miv_df_unified['MessungDatZeit'].str.split('T', expand=True)
     miv_df_unified[['Hrs', 'Mins', 'Sec']] = miv_df_unified['Time'].str.split(':', expand=True)
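The two str.split calls above turn the MessungDatZeit timestamp into separate date and time columns; a small sketch with made-up sample values to show what the resulting frame contains:

import pandas as pd

# Hypothetical sample in the same 'YYYY-MM-DDTHH:MM:SS' shape as MessungDatZeit.
miv_df = pd.DataFrame({'MessungDatZeit': ['2023-12-03T10:58:50', '2023-12-03T11:15:36']})

# First split on 'T' into calendar date and clock time ...
miv_df[['Date', 'Time']] = miv_df['MessungDatZeit'].str.split('T', expand=True)
# ... then split the time on ':' into hour, minute and second strings.
miv_df[['Hrs', 'Mins', 'Sec']] = miv_df['Time'].str.split(':', expand=True)

print(miv_df[['Date', 'Hrs', 'Mins', 'Sec']])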
@@ -65,13 +71,12 @@ def process_accident_data():
     acc_cols_to_keep = ['AccidentUID', 'AccidentHour', 'AccidentYear', 'AccidentWeekDay_en', 'AccidentType',
                         'AccidentSeverityCategory', 'AccidentInvolvingPedestrian', 'AccidentInvolvingBicycle',
                         'AccidentInvolvingMotorcycle', 'RoadType', 'RoadType_en', 'AccidentLocation_CHLV95_E',
-                        'AccidentLocation_CHLV95_N', 'geometry', 'AccidentMonth']
+                        'AccidentLocation_CHLV95_N', 'AccidentMonth', 'geometry']
     cleaned_acc_df = acc_df_unified[acc_cols_to_keep]
     return cleaned_acc_df
 
 
 if __name__ == '__main__':
-    fb_df = process_miv_data()
-    print(fb_df['MessungDatZeit'])
-    print(fb_df.dtypes)
-    print(fb_df.head(100))
+    acc_df = process_accident_data()
+    print(acc_df.dtypes)
+    print(acc_df.head(100))
@@ -13,8 +13,8 @@
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
-    "end_time": "2023-12-02T23:43:55.980827Z",
-    "start_time": "2023-12-02T23:43:55.546732Z"
+    "end_time": "2023-12-03T10:58:50.698090Z",
+    "start_time": "2023-12-03T10:58:50.384352Z"
    }
   },
   "id": "be55b25929d95559"
@@ -44,8 +44,8 @@
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
-    "end_time": "2023-12-02T23:48:08.233784Z",
-    "start_time": "2023-12-02T23:43:55.980667Z"
+    "end_time": "2023-12-03T11:01:14.422749Z",
+    "start_time": "2023-12-03T10:58:52.300667Z"
    }
   },
   "id": "dd3831953afdeb72"
@@ -123,7 +123,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 3,
    "outputs": [
     {
      "name": "stdout",
@@ -163,8 +163,8 @@
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
-    "end_time": "2023-12-02T23:49:50.185415Z",
-    "start_time": "2023-12-02T23:49:34.846049Z"
+    "end_time": "2023-12-03T11:15:51.051154Z",
+    "start_time": "2023-12-03T11:15:36.154717Z"
    }
   },
   "id": "f86bc612060b17a4"