Add logger.

Sebastian Lenzlinger 2023-12-03 12:28:32 +01:00
parent e73962d8e1
commit ca3450a4de
3 changed files with 28 additions and 22 deletions

data_utils.py

@@ -1,11 +1,13 @@
 # data_utils.py
 import os
 import pandas as pd
 import requests
 from urllib.parse import urlparse
 import geopandas as gpd
 from concurrent.futures import ThreadPoolExecutor as tpe
+import logging
+logging.basicConfig(level=logging.DEBUG, filename='data_utils.log', format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger('data_utils.py')
 
 def download_csv(url, local_filename):
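The three added lines route every logger call in this module to data_utils.log. A minimal sketch of how this configuration behaves (file and logger names are taken from the diff; the message is illustrative):

import logging

logging.basicConfig(level=logging.DEBUG, filename='data_utils.log',
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('data_utils.py')

logger.debug("Downloading https://example.com/data.csv ...")
# appends to data_utils.log a line like:
# 2023-12-03 12:28:32,123 - data_utils.py - DEBUG - Downloading https://example.com/data.csv ...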
@@ -33,9 +35,9 @@ def process_urls(data_dir, urls_file):
         # Check if the file already exists
         if not os.path.isfile(local_filename):
-            print(f"Downloading {url}...")
+            logger.debug(f"Downloading {url}...")
             download_csv(url, local_filename)
-            print(f"Saved to {local_filename}")
+            logger.debug(f"Saved to {local_filename}")
         else:
             print(f"File {filename} already exists in {data_dir}, skipping download.")
@@ -45,7 +47,7 @@ def load_dataframe_from_csv(filepath):
         df = pd.read_csv(filepath, low_memory=False)
         return df
     except Exception as e:
-        print(f"Error loading {filepath}: {e}")
+        logger.error(f"Error loading {filepath}: {e}")
         return None
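Inside an except block, logger.exception is a close relative of the logger.error call added here: it also logs at ERROR level but appends the full traceback. A minimal sketch (a variant, not what the commit does):

import logging

logging.basicConfig(level=logging.DEBUG, filename='data_utils.log')
logger = logging.getLogger('data_utils.py')

try:
    raise ValueError("bad CSV")                  # stand-in for pd.read_csv failing
except Exception:
    logger.exception("Error loading some.csv")   # ERROR entry plus traceback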
@@ -75,11 +77,11 @@ def load_dataframes_from_geojson_files(data_dir, u_string):
     print('u_string', u_string)
     gdf = gpd.GeoDataFrame()
     for filename in os.listdir(data_dir):
-        print("Filename:", filename)
+        #print("Filename:", filename)
         if (u_string in filename) and filename.endswith('.json'):
             filepath = os.path.join(data_dir, filename)
             print("Filepath:", filepath)
-            gdf = gpd.read_file(filepath)  # Read GeoJSON directly as GeoDataFrame
+            gdf = gpd.read_file(filepath)
     return gdf
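Unrelated to the logging change: gdf is reassigned on each matching file, so this function returns only the last GeoJSON it reads. If merging all matches is the intent, one way to do it (a hypothetical variant, not part of this commit):

import os
import pandas as pd
import geopandas as gpd

def load_all_geojson(data_dir, u_string):
    frames = [gpd.read_file(os.path.join(data_dir, f))
              for f in sorted(os.listdir(data_dir))
              if u_string in f and f.endswith('.json')]
    # One GeoDataFrame with all matching features; empty if nothing matched.
    return gpd.GeoDataFrame(pd.concat(frames, ignore_index=True)) if frames else gpd.GeoDataFrame()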
@@ -90,7 +92,7 @@ def combine_dataframes(dataframes):
         return combined_dataframe
     else:
         print("No dataframes to combine")
-        return pd.DataFrame()  # Return an empty DataFrame
+        return pd.DataFrame()
 
 def create_unified_df(urls_file, u_string, data_dir, files_present=False):
@@ -110,7 +112,6 @@ def save_dataframe_to_csv(df, integrated_dir, filename):
 if __name__ == "__main__":
-    # Test the functions here if necessary
     csv_urls_file = '../docs/all_csv_urls.txt'
     datasets_dir = 'datasets/'
     output_file = 'column_names.txt'

integrate.py

@@ -4,6 +4,12 @@ import os
 import requests
 import pandas as pd
+import logging
+
+logging.basicConfig(level=logging.DEBUG, filename='integrate.log',
+                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger('integrate.py')
+
 foot_bike_urls_file = '../docs/foot_bike_zaehlung_urls.txt'
 miv_file_urls = '../docs/verkehrszaehlung_moto_urls.txt'
 accident_file_url = '../docs/accident_loc_urls.txt'
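Worth noting: logging.basicConfig configures the root logger only if it has no handlers yet. Since this file also uses data_utils (as du below), and both modules now call basicConfig at import time, whichever call runs first wins and both loggers end up writing to the same file. A dedicated handler per module avoids that; a minimal sketch (a hypothetical alternative, not what the commit does):

import logging

def make_file_logger(name, path):
    # Attach a dedicated FileHandler instead of relying on basicConfig,
    # so each module keeps its own log file regardless of import order.
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)
    handler = logging.FileHandler(path)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(handler)
    return logger

logger = make_file_logger('integrate.py', 'integrate.log')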
@@ -41,7 +47,7 @@ def process_foot_bike_data():
 
 def process_miv_data():
-    miv_df_unified = du.create_unified_df(miv_file_urls, motor_file_u_string, data_dir,files_present=True)
+    miv_df_unified = du.create_unified_df(miv_file_urls, motor_file_u_string, data_dir, files_present=True)
     miv_df_unified[['Date', "Time"]] = miv_df_unified['MessungDatZeit'].str.split('T', expand=True)
     miv_df_unified[['Hrs', 'Mins', 'Sec']] = miv_df_unified['Time'].str.split(':', expand=True)
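For reference, the two context lines above split ISO 8601 timestamps such as '2023-12-03T11:01:14' into date and time components. A self-contained illustration with made-up values:

import pandas as pd

df = pd.DataFrame({'MessungDatZeit': ['2023-12-03T11:01:14', '2023-12-03T12:28:32']})
df[['Date', 'Time']] = df['MessungDatZeit'].str.split('T', expand=True)
df[['Hrs', 'Mins', 'Sec']] = df['Time'].str.split(':', expand=True)
print(df[['Date', 'Hrs', 'Mins', 'Sec']])
#          Date Hrs Mins Sec
# 0  2023-12-03  11   01  14
# 1  2023-12-03  12   28  32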
@@ -65,13 +71,12 @@ def process_accident_data():
     acc_cols_to_keep = ['AccidentUID', 'AccidentHour', 'AccidentYear', 'AccidentWeekDay_en', 'AccidentType',
                         'AccidentSeverityCategory', 'AccidentInvolvingPedestrian', 'AccidentInvolvingBicycle',
                         'AccidentInvolvingMotorcycle', 'RoadType', 'RoadType_en', 'AccidentLocation_CHLV95_E',
-                        'AccidentLocation_CHLV95_N', 'geometry', 'AccidentMonth']
+                        'AccidentLocation_CHLV95_N', 'AccidentMonth', 'geometry']
     cleaned_acc_df = acc_df_unified[acc_cols_to_keep]
     return cleaned_acc_df
 
 if __name__ == '__main__':
-    fb_df = process_miv_data()
-    print(fb_df['MessungDatZeit'])
-    print(fb_df.dtypes)
-    print(fb_df.head(100))
+    acc_df = process_accident_data()
+    print(acc_df.dtypes)
+    print(acc_df.head(100))
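Selecting with acc_cols_to_keep returns the columns in the listed order, so the column edit simply moves geometry to the end; a GeoDataFrame tracks its active geometry column by name, not by position. A toy example (hypothetical data; Swiss LV95 coordinates to match the CHLV95 columns):

import geopandas as gpd
from shapely.geometry import Point

gdf = gpd.GeoDataFrame({'AccidentUID': ['a1'], 'AccidentMonth': [12]},
                       geometry=[Point(2683000, 1247000)], crs='EPSG:2056')
out = gdf[['AccidentUID', 'AccidentMonth', 'geometry']]  # column order follows the list
print(type(out).__name__, list(out.columns))
# GeoDataFrame ['AccidentUID', 'AccidentMonth', 'geometry']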

Jupyter notebook (filename not shown in this view)

@@ -13,8 +13,8 @@
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2023-12-02T23:43:55.980827Z",
-     "start_time": "2023-12-02T23:43:55.546732Z"
+     "end_time": "2023-12-03T10:58:50.698090Z",
+     "start_time": "2023-12-03T10:58:50.384352Z"
     }
    },
    "id": "be55b25929d95559"
@@ -44,8 +44,8 @@
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2023-12-02T23:48:08.233784Z",
-     "start_time": "2023-12-02T23:43:55.980667Z"
+     "end_time": "2023-12-03T11:01:14.422749Z",
+     "start_time": "2023-12-03T10:58:52.300667Z"
     }
    },
    "id": "dd3831953afdeb72"
@@ -123,7 +123,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 6,
+    "execution_count": 3,
     "outputs": [
      {
       "name": "stdout",
@@ -163,8 +163,8 @@
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2023-12-02T23:49:50.185415Z",
-     "start_time": "2023-12-02T23:49:34.846049Z"
+     "end_time": "2023-12-03T11:15:51.051154Z",
+     "start_time": "2023-12-03T11:15:36.154717Z"
     }
    },
    "id": "f86bc612060b17a4"