From 77bf140efcbc0a33f16e13b172f62eddb93ad49a Mon Sep 17 00:00:00 2001
From: Sebastian Lenzlinger <74497638+sebaschi@users.noreply.github.com>
Date: Thu, 16 Nov 2023 18:40:42 +0100
Subject: [PATCH] Add Text File containing relevant datasource urls. First
diary entries written. Wiki entries on how to setup a python virtual env for
the project
---
docs/accident_loc_urls.txt | 1 +
docs/all_csv_urls.txt | 22 ++++++
docs/diary.md | 4 +
docs/foot_bike_zaehlung_urls.txt | 11 +++
docs/verkehrszaehlung_moto_urls.txt | 11 +++
src/data_utils.py | 118 ++++++++++++++++++++++++++++
src/integrate.py | 77 ++++++++++++++++++
src/preparations.py | 3 +
8 files changed, 247 insertions(+)
create mode 100644 docs/accident_loc_urls.txt
create mode 100644 docs/all_csv_urls.txt
create mode 100644 docs/foot_bike_zaehlung_urls.txt
create mode 100644 docs/verkehrszaehlung_moto_urls.txt
create mode 100644 src/data_utils.py
create mode 100644 src/integrate.py
create mode 100644 src/preparations.py
diff --git a/docs/accident_loc_urls.txt b/docs/accident_loc_urls.txt
new file mode 100644
index 0000000..1378079
--- /dev/null
+++ b/docs/accident_loc_urls.txt
@@ -0,0 +1 @@
+https://data.stadt-zuerich.ch/dataset/sid_dav_strassenverkehrsunfallorte/download/RoadTrafficAccidentLocations.json
diff --git a/docs/all_csv_urls.txt b/docs/all_csv_urls.txt
new file mode 100644
index 0000000..b9731a0
--- /dev/null
+++ b/docs/all_csv_urls.txt
@@ -0,0 +1,22 @@
+https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2012.csv
+https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2013.csv
+https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2014.csv
+https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2015.csv
+https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2016.csv
+https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2017.csv
+https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2018.csv
+https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2019.csv
+https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2020.csv
+https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2021.csv
+https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2022.csv
+https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2012_verkehrszaehlungen_werte_fussgaenger_velo.csv
+https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2013_verkehrszaehlungen_werte_fussgaenger_velo.csv
+https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2014_verkehrszaehlungen_werte_fussgaenger_velo.csv
+https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2015_verkehrszaehlungen_werte_fussgaenger_velo.csv
+https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2016_verkehrszaehlungen_werte_fussgaenger_velo.csv
+https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2017_verkehrszaehlungen_werte_fussgaenger_velo.csv
+https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2018_verkehrszaehlungen_werte_fussgaenger_velo.csv
+https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2019_verkehrszaehlungen_werte_fussgaenger_velo.csv
+https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2020_verkehrszaehlungen_werte_fussgaenger_velo.csv
+https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2021_verkehrszaehlungen_werte_fussgaenger_velo.csv
+https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2022_verkehrszaehlungen_werte_fussgaenger_velo.csv
\ No newline at end of file
diff --git a/docs/diary.md b/docs/diary.md
index 5d4030c..fc63c34 100644
--- a/docs/diary.md
+++ b/docs/diary.md
@@ -1,3 +1,7 @@
+# TODOs
+* Write a script that makes tables and inserts the data.
+* Find out if data cleaning can be done in Python with pandas or if it all must be SQL scripts.
+
# Project Diary
| Version
0.00 | Author:
michel.romancuk@stud.unibas.ch
sebastian.lenzlinger@unibas.ch
| HS 2023
Databases
|
diff --git a/docs/foot_bike_zaehlung_urls.txt b/docs/foot_bike_zaehlung_urls.txt
new file mode 100644
index 0000000..00f6353
--- /dev/null
+++ b/docs/foot_bike_zaehlung_urls.txt
@@ -0,0 +1,11 @@
+https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2012_verkehrszaehlungen_werte_fussgaenger_velo.csv
+https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2013_verkehrszaehlungen_werte_fussgaenger_velo.csv
+https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2014_verkehrszaehlungen_werte_fussgaenger_velo.csv
+https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2015_verkehrszaehlungen_werte_fussgaenger_velo.csv
+https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2016_verkehrszaehlungen_werte_fussgaenger_velo.csv
+https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2017_verkehrszaehlungen_werte_fussgaenger_velo.csv
+https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2018_verkehrszaehlungen_werte_fussgaenger_velo.csv
+https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2019_verkehrszaehlungen_werte_fussgaenger_velo.csv
+https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2020_verkehrszaehlungen_werte_fussgaenger_velo.csv
+https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2021_verkehrszaehlungen_werte_fussgaenger_velo.csv
+https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2022_verkehrszaehlungen_werte_fussgaenger_velo.csv
\ No newline at end of file
diff --git a/docs/verkehrszaehlung_moto_urls.txt b/docs/verkehrszaehlung_moto_urls.txt
new file mode 100644
index 0000000..427888d
--- /dev/null
+++ b/docs/verkehrszaehlung_moto_urls.txt
@@ -0,0 +1,11 @@
+https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2012.csv
+https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2013.csv
+https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2014.csv
+https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2015.csv
+https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2016.csv
+https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2017.csv
+https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2018.csv
+https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2019.csv
+https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2020.csv
+https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2021.csv
+https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2022.csv
\ No newline at end of file
diff --git a/src/data_utils.py b/src/data_utils.py
new file mode 100644
index 0000000..584619c
--- /dev/null
+++ b/src/data_utils.py
@@ -0,0 +1,118 @@
+# data_utils.py
+
+import os
+import pandas as pd
+import requests
+from urllib.parse import urlparse
+import geopandas as gpd
+from concurrent.futures import ThreadPoolExecutor as tpe
+
+
+def download_csv(url, local_filename):
+ with requests.get(url, stream=True) as r:
+ r.raise_for_status()
+ with open(local_filename, 'wb') as f:
+ for chunk in r.iter_content(chunk_size=8192):
+ f.write(chunk)
+
+
+def process_urls(data_dir, urls_file):
+ # Ensure the data directory exists
+ if not os.path.exists(data_dir):
+ os.makedirs(data_dir)
+
+ # Read URLs from the file
+ with open(urls_file, 'r') as file:
+ urls = file.readlines()
+
+ # Process each URL
+ for url in urls:
+ url = url.strip()
+ filename = os.path.basename(urlparse(url).path)
+ local_filename = os.path.join(data_dir, filename)
+
+ # Check if the file already exists
+ if not os.path.isfile(local_filename):
+ print(f"Downloading {url}...")
+ download_csv(url, local_filename)
+ print(f"Saved to {local_filename}")
+ else:
+            print(f"File {filename} already exists in {data_dir}, skipping download.")
+
+
+def load_dataframe_from_csv(filepath):
+ try:
+ df = pd.read_csv(filepath, low_memory=False)
+ return df
+ except Exception as e:
+ print(f"Error loading {filepath}: {e}")
+ return None
+
+
+def load_dataframes_from_csv_files(data_dir, u_string):
+ dataframes = []
+
+ with tpe(max_workers=5) as executor:
+ for filename in os.listdir(data_dir):
+ if (u_string in filename) and filename.endswith('.csv'):
+ filepath = os.path.join(data_dir, filename)
+ future = executor.submit(load_dataframe_from_csv, filepath)
+ dataframes.append(future)
+
+ dataframes = [future.result() for future in dataframes if future.result() is not None]
+
+ return dataframes
+
+ # for filename in os.listdir(data_dir):
+ # if (u_string in filename) and filename.endswith('.csv'):
+ # filepath = os.path.join(data_dir, filename)
+ # df = pd.read_csv(filepath, low_memory=False)
+ # dataframes.append(df)
+ # return dataframes
+
+
+def load_dataframes_from_geojson_files(data_dir, u_string):
+ print('u_string', u_string)
+ gdf = gpd.GeoDataFrame()
+ for filename in os.listdir(data_dir):
+ print("Filename:", filename)
+ if (u_string in filename) and filename.endswith('.json'):
+ filepath = os.path.join(data_dir, filename)
+ print("Filepath:", filepath)
+ gdf = gpd.read_file(filepath) # Read GeoJSON directly as GeoDataFrame
+
+ return gdf
+
+
+def combine_dataframes(dataframes):
+ if dataframes:
+ combined_dataframe = pd.concat(dataframes, ignore_index=True)
+ return combined_dataframe
+ else:
+ print("No dataframes to combine")
+ return pd.DataFrame() # Return an empty DataFrame
+
+
+def create_unified_df(urls_file, u_string, data_dir, files_present=False):
+ df_list = []
+ df_unified = None
+ if not files_present:
+ process_urls(data_dir, urls_file)
+
+ df_list = load_dataframes_from_csv_files(data_dir, u_string)
+ df_unified = combine_dataframes(df_list)
+
+ return df_unified
+
+
+def save_dataframe_to_csv(df, integrated_dir, filename):
+ pass
+
+
+if __name__ == "__main__":
+ # Test the functions here if necessary
+ csv_urls_file = '../docs/all_csv_urls.txt'
+ datasets_dir = 'datasets/'
+ output_file = 'column_names.txt'
+ process_urls(datasets_dir, csv_urls_file)
+ # extract_column_names(datasets_dir, output_file)
diff --git a/src/integrate.py b/src/integrate.py
new file mode 100644
index 0000000..050fcb4
--- /dev/null
+++ b/src/integrate.py
@@ -0,0 +1,77 @@
+import data_utils as du
+from datetime import datetime as dt
+import os
+import requests
+import pandas as pd
+
+foot_bike_urls_file = '../docs/foot_bike_zaehlung_urls.txt'
+miv_file_urls = '../docs/verkehrszaehlung_moto_urls.txt'
+accident_file_url = '../docs/accident_loc_urls.txt'
+
+# Using u_string to discriminate between files that belong to each other
+motor_file_u_string = 'sid_dav_verkehrszaehlung_miv_OD2031'
+foot_bike_file_u_string = 'velo.csv'
+accident_file_u_string = 'RoadTrafficAccidentLocations.json'
+
+data_dir = 'datasets/'
+integrated_dir = 'datasets/integrated/'
+
+weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
+
+
+def process_foot_bike_data():
+ fb_df_unified = du.create_unified_df(foot_bike_urls_file, foot_bike_file_u_string, data_dir, files_present=True)
+ fb_df_unified[['DATE', "TIME"]] = fb_df_unified['DATUM'].str.split('T', expand=True)
+ fb_df_unified[['HRS', 'MINS']] = fb_df_unified['TIME'].str.split(':', expand=True)
+    ## NOTE: we might still need FK_ZAEHLER after all
+ fb_cols_to_drop = ['DATUM']
+ fb_df_unified_correct_cols = fb_df_unified.drop(columns=fb_cols_to_drop, axis=1)
+ fb_df_unified_correct_cols.fillna(0, inplace=True)
+ fb_df_grouped = fb_df_unified_correct_cols.groupby(['OST', 'NORD', 'DATE', 'HRS']).agg({
+ 'VELO_IN': 'sum',
+ 'VELO_OUT': 'sum',
+ 'FUSS_IN': 'sum',
+ 'FUSS_OUT': 'sum'
+ }).reset_index()
+ dt_obj = pd.to_datetime(fb_df_grouped['DATE'])
+ days = dt_obj.dt.weekday
+ fb_df_grouped['Weekday_en'] = days.map(lambda x: weekday_names[x])
+ cleaned_fb_df = fb_df_grouped
+ return cleaned_fb_df
+
+
+def process_miv_data():
+ miv_df_unified = du.create_unified_df(miv_file_urls, motor_file_u_string, data_dir,files_present=True)
+
+ miv_df_unified[['Date', "Time"]] = miv_df_unified['MessungDatZeit'].str.split('T', expand=True)
+ miv_df_unified[['Hrs', 'Mins', 'Sec']] = miv_df_unified['Time'].str.split(':', expand=True)
+
+ miv_cols_to_keep = ['MSID','ZSID','Achse', 'EKoord', 'NKoord', 'Richtung', 'AnzFahrzeuge', 'AnzFahrzeugeStatus',
+ 'Date', 'Hrs']
+ miv_df_cols_dropped = miv_df_unified[miv_cols_to_keep]
+
+ dt_obj = pd.to_datetime(miv_df_cols_dropped['Date'])
+ days = dt_obj.dt.weekday
+ miv_df_cols_dropped['Weekday_en'] = days.map(lambda x: weekday_names[x])
+
+
+ cleaned_miv_df = miv_df_cols_dropped
+ return cleaned_miv_df
+
+
+def process_accident_data():
+
+ acc_df_unified = du.load_dataframes_from_geojson_files(data_dir, accident_file_u_string)
+ acc_cols_to_keep = ['AccidentUID', 'AccidentHour', 'AccidentYear', 'AccidentWeekDay_en', 'AccidentType',
+ 'AccidentSeverityCategory', 'AccidentInvolvingPedestrian', 'AccidentInvolvingBicycle',
+ 'AccidentInvolvingMotorcycle', 'RoadType', 'RoadType_en', 'AccidentLocation_CHLV95_E',
+ 'AccidentLocation_CHLV95_N', 'geometry']
+ cleaned_acc_df = acc_df_unified[acc_cols_to_keep]
+ return cleaned_acc_df
+
+
+if __name__ == '__main__':
+    miv_df = process_miv_data()
+    print(miv_df['Date'])
+    print(miv_df.dtypes)
+    print(miv_df.head(100))
diff --git a/src/preparations.py b/src/preparations.py
new file mode 100644
index 0000000..499d1d6
--- /dev/null
+++ b/src/preparations.py
@@ -0,0 +1,3 @@
+import data_utils
+
+