Add logger.
parent e73962d8e1
commit ca3450a4de
@@ -1,11 +1,13 @@
 # data_utils.py

 import os
 import pandas as pd
 import requests
 from urllib.parse import urlparse
 import geopandas as gpd
 from concurrent.futures import ThreadPoolExecutor as tpe
+import logging
+
+logging.basicConfig(level=logging.DEBUG, filename='data_utils.log', format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger('data_utils.py')


 def download_csv(url, local_filename):
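For readers skimming the diff: basicConfig configures the process-wide root logger to append DEBUG-and-above records to data_utils.log, and getLogger returns a named logger whose name fills the %(name)s field of the format string. A minimal, self-contained sketch of the same pattern (the message is illustrative only):

import logging

# Configure the root logger once: DEBUG level, records appended to a file.
logging.basicConfig(level=logging.DEBUG, filename='data_utils.log',
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Named logger; its name appears in the %(name)s field of every record.
logger = logging.getLogger('data_utils.py')
logger.debug('logger configured')  # written to data_utils.log, not stdout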
@@ -33,9 +35,9 @@ def process_urls(data_dir, urls_file):

         # Check if the file already exists
         if not os.path.isfile(local_filename):
-            print(f"Downloading {url}...")
+            logger.debug(f"Downloading {url}...")
             download_csv(url, local_filename)
-            print(f"Saved to {local_filename}")
+            logger.debug(f"Saved to {local_filename}")
         else:
             print(f"File {filename} already exists in {data_dir}, skipping download.")
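One stylistic note on the new calls: f-strings are formatted eagerly even when DEBUG records are filtered out, whereas logging's %-style arguments are only interpolated if the record is actually emitted. A tiny sketch of the difference (url is a hypothetical value, not from the repository):

import logging

logger = logging.getLogger('data_utils.py')
url = 'https://example.com/data.csv'  # hypothetical, for illustration

logger.debug(f"Downloading {url}...")   # string built eagerly, always
logger.debug("Downloading %s...", url)  # built lazily, only if emitted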
@@ -45,7 +47,7 @@ def load_dataframe_from_csv(filepath):
         df = pd.read_csv(filepath, low_memory=False)
         return df
     except Exception as e:
-        print(f"Error loading {filepath}: {e}")
+        logger.error(f"Error loading {filepath}: {e}")
         return None
@@ -75,11 +77,11 @@ def load_dataframes_from_geojson_files(data_dir, u_string):
     print('u_string', u_string)
     gdf = gpd.GeoDataFrame()
     for filename in os.listdir(data_dir):
-        print("Filename:", filename)
+        #print("Filename:", filename)
         if (u_string in filename) and filename.endswith('.json'):
             filepath = os.path.join(data_dir, filename)
             print("Filepath:", filepath)
-            gdf = gpd.read_file(filepath)  # Read GeoJSON directly as GeoDataFrame
+            gdf = gpd.read_file(filepath)

     return gdf
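Independent of the logging change, note that the loop above rebinds gdf on every matching file, so the function returns only the last GeoJSON it read. If the intent is to keep all matching files, the usual pattern is to collect and concatenate; a hedged sketch under that assumption (load_all_geojson is a hypothetical helper, not in the repository):

import os
import pandas as pd
import geopandas as gpd

def load_all_geojson(data_dir, u_string):
    # One GeoDataFrame per matching file, instead of overwriting gdf.
    frames = [gpd.read_file(os.path.join(data_dir, f))
              for f in os.listdir(data_dir)
              if u_string in f and f.endswith('.json')]
    if not frames:
        return gpd.GeoDataFrame()
    return gpd.GeoDataFrame(pd.concat(frames, ignore_index=True))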
@@ -90,7 +92,7 @@ def combine_dataframes(dataframes):
         return combined_dataframe
     else:
         print("No dataframes to combine")
-        return pd.DataFrame()  # Return an empty DataFrame
+        return pd.DataFrame()


 def create_unified_df(urls_file, u_string, data_dir, files_present=False):
@@ -110,7 +112,6 @@ def save_dataframe_to_csv(df, integrated_dir, filename):


 if __name__ == "__main__":
     # Test the functions here if necessary
     csv_urls_file = '../docs/all_csv_urls.txt'
     datasets_dir = 'datasets/'
     output_file = 'column_names.txt'
@@ -4,6 +4,12 @@ import os
 import requests
 import pandas as pd

+import logging
+
+logging.basicConfig(level=logging.DEBUG, filename='integrate.log',
+                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger('integrate.py')
+
 foot_bike_urls_file = '../docs/foot_bike_zaehlung_urls.txt'
 miv_file_urls = '../docs/verkehrszaehlung_moto_urls.txt'
 accident_file_url = '../docs/accident_loc_urls.txt'
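A caveat worth flagging now that two modules (data_utils and integrate, judging by the logger names) each call logging.basicConfig: basicConfig is a no-op once the root logger already has handlers, so if both modules are imported into one process, only the first call wins and all records land in a single log file. A quick demonstration of that behavior:

import logging

logging.basicConfig(filename='integrate.log', level=logging.DEBUG)
# Ignored: the root logger was already configured by the call above.
logging.basicConfig(filename='data_utils.log', level=logging.DEBUG)

# Shows a single FileHandler, still pointing at integrate.log.
print(logging.getLogger().handlers)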
@@ -41,7 +47,7 @@ def process_foot_bike_data():


 def process_miv_data():
-    miv_df_unified = du.create_unified_df(miv_file_urls, motor_file_u_string, data_dir,files_present=True)
+    miv_df_unified = du.create_unified_df(miv_file_urls, motor_file_u_string, data_dir, files_present=True)

     miv_df_unified[['Date', "Time"]] = miv_df_unified['MessungDatZeit'].str.split('T', expand=True)
     miv_df_unified[['Hrs', 'Mins', 'Sec']] = miv_df_unified['Time'].str.split(':', expand=True)
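For context on the two split lines shown above: .str.split(..., expand=True) returns one column per fragment, which is what lets them be assigned to several new columns at once. A minimal sketch with a made-up sample value:

import pandas as pd

df = pd.DataFrame({'MessungDatZeit': ['2023-12-03T10:58:50']})  # hypothetical sample
df[['Date', 'Time']] = df['MessungDatZeit'].str.split('T', expand=True)
df[['Hrs', 'Mins', 'Sec']] = df['Time'].str.split(':', expand=True)
print(df[['Date', 'Hrs', 'Mins', 'Sec']])  # 2023-12-03, 10, 58, 50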
@@ -65,13 +71,12 @@ def process_accident_data():
     acc_cols_to_keep = ['AccidentUID', 'AccidentHour', 'AccidentYear', 'AccidentWeekDay_en', 'AccidentType',
                         'AccidentSeverityCategory', 'AccidentInvolvingPedestrian', 'AccidentInvolvingBicycle',
                         'AccidentInvolvingMotorcycle', 'RoadType', 'RoadType_en', 'AccidentLocation_CHLV95_E',
-                        'AccidentLocation_CHLV95_N', 'geometry', 'AccidentMonth']
+                        'AccidentLocation_CHLV95_N', 'AccidentMonth', 'geometry']
     cleaned_acc_df = acc_df_unified[acc_cols_to_keep]
     return cleaned_acc_df


 if __name__ == '__main__':
     fb_df = process_miv_data()
     print(fb_df['MessungDatZeit'])
     print(fb_df.dtypes)
     print(fb_df.head(100))
     acc_df = process_accident_data()
     print(acc_df.dtypes)
     print(acc_df.head(100))
@@ -13,8 +13,8 @@
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
-    "end_time": "2023-12-02T23:43:55.980827Z",
-    "start_time": "2023-12-02T23:43:55.546732Z"
+    "end_time": "2023-12-03T10:58:50.698090Z",
+    "start_time": "2023-12-03T10:58:50.384352Z"
    }
   },
   "id": "be55b25929d95559"
@@ -44,8 +44,8 @@
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
-    "end_time": "2023-12-02T23:48:08.233784Z",
-    "start_time": "2023-12-02T23:43:55.980667Z"
+    "end_time": "2023-12-03T11:01:14.422749Z",
+    "start_time": "2023-12-03T10:58:52.300667Z"
    }
   },
   "id": "dd3831953afdeb72"
@@ -123,7 +123,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 3,
    "outputs": [
    {
     "name": "stdout",
@@ -163,8 +163,8 @@
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
-    "end_time": "2023-12-02T23:49:50.185415Z",
-    "start_time": "2023-12-02T23:49:34.846049Z"
+    "end_time": "2023-12-03T11:15:51.051154Z",
+    "start_time": "2023-12-03T11:15:36.154717Z"
    }
   },
   "id": "f86bc612060b17a4"