From 77bf140efcbc0a33f16e13b172f62eddb93ad49a Mon Sep 17 00:00:00 2001 From: Sebastian Lenzlinger <74497638+sebaschi@users.noreply.github.com> Date: Thu, 16 Nov 2023 18:40:42 +0100 Subject: [PATCH] Add Text File containing relevant datasource urls. First diary entries written. Wiki entries on how to setup a python virtual env for the project --- docs/accident_loc_urls.txt | 1 + docs/all_csv_urls.txt | 22 ++++++ docs/diary.md | 4 + docs/foot_bike_zaehlung_urls.txt | 11 +++ docs/verkehrszaehlung_moto_urls.txt | 11 +++ src/data_utils.py | 118 ++++++++++++++++++++++++++++ src/integrate.py | 77 ++++++++++++++++++ src/preparations.py | 3 + 8 files changed, 247 insertions(+) create mode 100644 docs/accident_loc_urls.txt create mode 100644 docs/all_csv_urls.txt create mode 100644 docs/foot_bike_zaehlung_urls.txt create mode 100644 docs/verkehrszaehlung_moto_urls.txt create mode 100644 src/data_utils.py create mode 100644 src/integrate.py create mode 100644 src/preparations.py diff --git a/docs/accident_loc_urls.txt b/docs/accident_loc_urls.txt new file mode 100644 index 0000000..1378079 --- /dev/null +++ b/docs/accident_loc_urls.txt @@ -0,0 +1 @@ +https://data.stadt-zuerich.ch/dataset/sid_dav_strassenverkehrsunfallorte/download/RoadTrafficAccidentLocations.json diff --git a/docs/all_csv_urls.txt b/docs/all_csv_urls.txt new file mode 100644 index 0000000..b9731a0 --- /dev/null +++ b/docs/all_csv_urls.txt @@ -0,0 +1,22 @@ +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2012.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2013.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2014.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2015.csv 
+https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2016.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2017.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2018.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2019.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2020.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2021.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2022.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2012_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2013_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2014_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2015_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2016_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2017_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2018_verkehrszaehlungen_werte_fussgaenger_velo.csv 
+https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2019_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2020_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2021_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2022_verkehrszaehlungen_werte_fussgaenger_velo.csv \ No newline at end of file diff --git a/docs/diary.md b/docs/diary.md index 5d4030c..fc63c34 100644 --- a/docs/diary.md +++ b/docs/diary.md @@ -1,3 +1,7 @@ +# TODOs +* Write a script that makes tables and inserts the data. +* Find out if data cleaning can be done in python with pandas or if it all must be SQL scripts. + # Project Diary | Version
0.00 | Author:
michel.romancuk@stud.unibas.ch
sebastian.lenzlinger@unibas.ch
| HS 2023
Databases
| diff --git a/docs/foot_bike_zaehlung_urls.txt b/docs/foot_bike_zaehlung_urls.txt new file mode 100644 index 0000000..00f6353 --- /dev/null +++ b/docs/foot_bike_zaehlung_urls.txt @@ -0,0 +1,11 @@ +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2012_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2013_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2014_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2015_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2016_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2017_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2018_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2019_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2020_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2021_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2022_verkehrszaehlungen_werte_fussgaenger_velo.csv \ No newline at end of file diff --git a/docs/verkehrszaehlung_moto_urls.txt b/docs/verkehrszaehlung_moto_urls.txt new file mode 100644 index 0000000..427888d --- 
# data_utils.py
"""Helpers for downloading the Zurich open-data CSV/GeoJSON sources and
loading them into (Geo)DataFrames."""

import os
import pandas as pd
import requests
from urllib.parse import urlparse
import geopandas as gpd
from concurrent.futures import ThreadPoolExecutor as tpe


def download_csv(url, local_filename):
    """Stream the file at *url* to *local_filename* in 8 KiB chunks.

    Raises requests.HTTPError on a non-2xx response instead of silently
    saving an error page.
    """
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)


def process_urls(data_dir, urls_file):
    """Download every URL listed (one per line) in *urls_file* into *data_dir*.

    Already-downloaded files are skipped, so re-running is cheap and safe.
    """
    # Ensure the data directory exists
    os.makedirs(data_dir, exist_ok=True)

    with open(urls_file, 'r') as file:
        urls = file.readlines()

    for url in urls:
        url = url.strip()
        if not url:
            continue  # tolerate blank lines in the URL list
        filename = os.path.basename(urlparse(url).path)
        local_filename = os.path.join(data_dir, filename)

        if not os.path.isfile(local_filename):
            print(f"Downloading {url}...")
            download_csv(url, local_filename)
            print(f"Saved to {local_filename}")
        else:
            # BUG FIX: message previously read "File (unknown) already exists";
            # the filename placeholder had been lost.
            print(f"File {filename} already exists in {data_dir}, skipping download.")


def load_dataframe_from_csv(filepath):
    """Read one CSV into a DataFrame; return None (and log) on failure."""
    try:
        return pd.read_csv(filepath, low_memory=False)
    except Exception as e:
        # Best-effort loading: a single bad file must not abort the batch.
        print(f"Error loading {filepath}: {e}")
        return None


def load_dataframes_from_csv_files(data_dir, u_string):
    """Load, concurrently, every CSV in *data_dir* whose name contains *u_string*.

    Returns a list of DataFrames; files that failed to parse are dropped.
    """
    futures = []
    with tpe(max_workers=5) as executor:
        for filename in os.listdir(data_dir):
            if (u_string in filename) and filename.endswith('.csv'):
                filepath = os.path.join(data_dir, filename)
                futures.append(executor.submit(load_dataframe_from_csv, filepath))

    # IMPROVED: result() was previously called twice per future inside one
    # comprehension; collect each result exactly once, then filter failures.
    results = [future.result() for future in futures]
    return [df for df in results if df is not None]


def load_dataframes_from_geojson_files(data_dir, u_string):
    """Return a GeoDataFrame read from the .json file in *data_dir* matching *u_string*.

    NOTE(review): if several files match, only the last one read wins —
    confirm a single matching file is the expected case.
    """
    gdf = gpd.GeoDataFrame()
    for filename in os.listdir(data_dir):
        if (u_string in filename) and filename.endswith('.json'):
            filepath = os.path.join(data_dir, filename)
            gdf = gpd.read_file(filepath)  # GeoJSON reads directly as GeoDataFrame
    return gdf


def combine_dataframes(dataframes):
    """Concatenate a list of DataFrames; empty DataFrame if the list is empty."""
    if dataframes:
        return pd.concat(dataframes, ignore_index=True)
    print("No dataframes to combine")
    return pd.DataFrame()  # Return an empty DataFrame


def create_unified_df(urls_file, u_string, data_dir, files_present=False):
    """Download (unless *files_present*) and merge all CSVs matching *u_string*."""
    if not files_present:
        process_urls(data_dir, urls_file)

    df_list = load_dataframes_from_csv_files(data_dir, u_string)
    return combine_dataframes(df_list)


def save_dataframe_to_csv(df, integrated_dir, filename):
    """Write *df* to integrated_dir/filename without the index column.

    BUG FIX: this was an unimplemented `pass` stub; callers silently wrote
    nothing.
    """
    os.makedirs(integrated_dir, exist_ok=True)
    df.to_csv(os.path.join(integrated_dir, filename), index=False)


if __name__ == "__main__":
    # Ad-hoc manual test: download everything listed in the master URL file.
    csv_urls_file = '../docs/all_csv_urls.txt'
    datasets_dir = 'datasets/'
    output_file = 'column_names.txt'
    process_urls(datasets_dir, csv_urls_file)
    # extract_column_names(datasets_dir, output_file)
# integrate.py
"""Cleaning/aggregation pipelines for the three Zurich traffic data sources."""

import data_utils as du
from datetime import datetime as dt
import os
import requests
import pandas as pd

foot_bike_urls_file = '../docs/foot_bike_zaehlung_urls.txt'
miv_file_urls = '../docs/verkehrszaehlung_moto_urls.txt'
accident_file_url = '../docs/accident_loc_urls.txt'

# Using u_string to discriminate between files that belong to each other
motor_file_u_string = 'sid_dav_verkehrszaehlung_miv_OD2031'
foot_bike_file_u_string = 'velo.csv'
accident_file_u_string = 'RoadTrafficAccidentLocations.json'

data_dir = 'datasets/'
integrated_dir = 'datasets/integrated/'

weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']


def process_foot_bike_data():
    """Clean the pedestrian/bicycle counts.

    Splits the ISO timestamp, aggregates counts per location/date/hour and
    adds an English weekday-name column. Returns the cleaned DataFrame.
    """
    fb_df = du.create_unified_df(foot_bike_urls_file, foot_bike_file_u_string, data_dir,
                                 files_present=True)
    # DATUM is ISO-like "YYYY-MM-DDTHH:MM" — split into date and time parts.
    fb_df[['DATE', "TIME"]] = fb_df['DATUM'].str.split('T', expand=True)
    fb_df[['HRS', 'MINS']] = fb_df['TIME'].str.split(':', expand=True)
    # NOTE(review): FK_ZAEHLER may be needed again later (original remark).
    fb_df = fb_df.drop(columns=['DATUM'])
    fb_df.fillna(0, inplace=True)
    fb_grouped = fb_df.groupby(['OST', 'NORD', 'DATE', 'HRS']).agg({
        'VELO_IN': 'sum',
        'VELO_OUT': 'sum',
        'FUSS_IN': 'sum',
        'FUSS_OUT': 'sum',
    }).reset_index()
    # Monday == 0 in pandas' weekday, matching weekday_names ordering.
    days = pd.to_datetime(fb_grouped['DATE']).dt.weekday
    fb_grouped['Weekday_en'] = days.map(lambda d: weekday_names[d])
    return fb_grouped


def process_miv_data():
    """Clean the motorized-traffic counts.

    Splits the timestamp, keeps the relevant columns and adds an English
    weekday-name column. Returns the cleaned DataFrame.
    """
    miv_df = du.create_unified_df(miv_file_urls, motor_file_u_string, data_dir,
                                  files_present=True)

    miv_df[['Date', "Time"]] = miv_df['MessungDatZeit'].str.split('T', expand=True)
    miv_df[['Hrs', 'Mins', 'Sec']] = miv_df['Time'].str.split(':', expand=True)

    miv_cols_to_keep = ['MSID', 'ZSID', 'Achse', 'EKoord', 'NKoord', 'Richtung',
                        'AnzFahrzeuge', 'AnzFahrzeugeStatus', 'Date', 'Hrs']
    # BUG FIX: take an explicit copy — the column slice returns a view, and
    # the Weekday_en assignment below previously triggered chained-assignment
    # (SettingWithCopyWarning) semantics.
    miv_df = miv_df[miv_cols_to_keep].copy()

    days = pd.to_datetime(miv_df['Date']).dt.weekday
    miv_df['Weekday_en'] = days.map(lambda d: weekday_names[d])

    return miv_df


def process_accident_data():
    """Load the accident-location GeoJSON and keep the integration-relevant columns."""
    acc_df = du.load_dataframes_from_geojson_files(data_dir, accident_file_u_string)
    acc_cols_to_keep = ['AccidentUID', 'AccidentHour', 'AccidentYear', 'AccidentWeekDay_en',
                        'AccidentType', 'AccidentSeverityCategory',
                        'AccidentInvolvingPedestrian', 'AccidentInvolvingBicycle',
                        'AccidentInvolvingMotorcycle', 'RoadType', 'RoadType_en',
                        'AccidentLocation_CHLV95_E', 'AccidentLocation_CHLV95_N',
                        'geometry']
    return acc_df[acc_cols_to_keep]


if __name__ == '__main__':
    miv_df = process_miv_data()
    # BUG FIX: 'MessungDatZeit' is dropped inside process_miv_data, so the
    # previous print(fb_df['MessungDatZeit']) raised KeyError; inspect the
    # retained columns instead. Also renamed the misleading fb_df variable.
    print(miv_df.dtypes)
    print(miv_df.head(100))