diff --git a/.gitignore b/.gitignore index 89af448..c96c287 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # Created by https://www.toptal.com/developers/gitignore/api/python,pycharm,linux,macos,database,data # Edit at https://www.toptal.com/developers/gitignore?templates=python,pycharm,linux,macos,database,data datasets/ +db23-project-venv/ ### Data ### *.csv *.dat diff --git a/DataExploration.ipynb b/DataExploration.ipynb index a52ecca..0d62426 100644 --- a/DataExploration.ipynb +++ b/DataExploration.ipynb @@ -2,14 +2,18 @@ "cells": [ { "cell_type": "code", - "execution_count": 20, + "execution_count": 1, "id": "17ca2acb", "metadata": { "collapsed": true, "jupyter": { "outputs_hidden": true }, - "tags": [] + "tags": [], + "ExecuteTime": { + "end_time": "2023-11-16T16:20:49.426349Z", + "start_time": "2023-11-16T16:20:16.117316Z" + } }, "outputs": [ { @@ -156,7 +160,11 @@ "2 Unbekannt 2016-01-01T02:00:00 2021-02-03 213.0 Gemessen \n", "3 Unbekannt 2016-01-01T03:00:00 2021-02-03 112.0 Gemessen \n", "4 Unbekannt 2016-01-01T04:00:00 2021-02-03 80.0 Gemessen \n", - "Data for year 2017:\n", + "File not found for year 2017: datasets/sid_dav_verkehrszaehlung_miv_OD2031_2017.csv\n", + "File not found for year 2018: datasets/sid_dav_verkehrszaehlung_miv_OD2031_2018.csv\n", + "File not found for year 2019: datasets/sid_dav_verkehrszaehlung_miv_OD2031_2019.csv\n", + "File not found for year 2020: datasets/sid_dav_verkehrszaehlung_miv_OD2031_2020.csv\n", + "Data for year 2021:\n", " MSID MSName ZSID ZSName Achse \\\n", "0 Z001M001 Unbekannt Z001 Seestrasse (Strandbad Wollishofen) Seestrasse \n", "1 Z001M001 Unbekannt Z001 Seestrasse (Strandbad Wollishofen) Seestrasse \n", @@ -179,34 +187,39 @@ "4 Badanstalt Wollishofen 1 2 Unbekannt Unbekannt \n", "\n", " D4ID MessungDatZeit LieferDat AnzFahrzeuge AnzFahrzeugeStatus \n", - "0 Unbekannt 2017-01-01T00:00:00 2021-02-03 295.0 Gemessen \n", - "1 Unbekannt 2017-01-01T01:00:00 2021-02-03 264.0 Gemessen \n", - "2 Unbekannt 2017-01-01T02:00:00 2021-02-03 180.0 Gemessen \n", - "3 Unbekannt 2017-01-01T03:00:00 2021-02-03 107.0 Gemessen \n", - "4 Unbekannt 2017-01-01T04:00:00 2021-02-03 97.0 Gemessen \n" - ] - }, - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/tmp/ipykernel_311061/2135127822.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlow_memory\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0mtable_key\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34mf\"miv_{year}\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - 
"\u001b[0;32m~/anaconda3/lib/python3.9/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 946\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkwds_defaults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 947\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 948\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 949\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 950\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.9/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 615\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 616\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 617\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 618\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 619\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.9/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 1746\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1747\u001b[0m \u001b[0mcol_dict\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1748\u001b[0;31m \u001b[0;34m)\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m \u001b[0;31m# type: ignore[attr-defined]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1749\u001b[0m \u001b[0mnrows\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1750\u001b[0m )\n", - "\u001b[0;32m~/anaconda3/lib/python3.9/site-packages/pandas/io/parsers/c_parser_wrapper.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 237\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 238\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 239\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 240\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 241\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_first_chunk\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32mparsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.read\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32mparsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._read_rows\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32mparsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_column_data\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32mparsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers._maybe_upcast\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.9/site-packages/numpy/core/multiarray.py\u001b[0m in \u001b[0;36mputmask\u001b[0;34m(a, mask, values)\u001b[0m\n\u001b[1;32m 1129\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1130\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1131\u001b[0;31m \u001b[0;34m@\u001b[0m\u001b[0marray_function_from_c_func_and_dispatcher\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_multiarray_umath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mputmask\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1132\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mputmask\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m/\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1133\u001b[0m \"\"\"\n", - "\u001b[0;32m_pydevd_bundle/pydevd_cython.pyx\u001b[0m in \u001b[0;36m_pydevd_bundle.pydevd_cython.SafeCallWrapper.__call__\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m_pydevd_bundle/pydevd_cython.pyx\u001b[0m in \u001b[0;36m_pydevd_bundle.pydevd_cython.ThreadTracer.__call__\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.9/site-packages/debugpy/_vendored/pydevd/_pydev_bundle/pydev_is_thread_alive.py\u001b[0m in \u001b[0;36mis_thread_alive\u001b[0;34m(t)\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_temp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'_is_stopped'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# Python 3.x has this\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0mis_thread_alive\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_is_stopped\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + "0 Unbekannt 2021-01-01T00:00:00 2021-12-30 122.0 Gemessen \n", + "1 Unbekannt 2021-01-01T01:00:00 
2021-12-30 177.0 Gemessen \n", + "2 Unbekannt 2021-01-01T02:00:00 2021-12-30 125.0 Gemessen \n", + "3 Unbekannt 2021-01-01T03:00:00 2021-12-30 84.0 Gemessen \n", + "4 Unbekannt 2021-01-01T04:00:00 2021-12-30 49.0 Gemessen \n", + "Data for year 2022:\n", + " MSID MSName ZSID ZSName Achse \\\n", + "0 Z001M001 Unbekannt Z001 Seestrasse (Strandbad Wollishofen) Seestrasse \n", + "1 Z001M001 Unbekannt Z001 Seestrasse (Strandbad Wollishofen) Seestrasse \n", + "2 Z001M001 Unbekannt Z001 Seestrasse (Strandbad Wollishofen) Seestrasse \n", + "3 Z001M001 Unbekannt Z001 Seestrasse (Strandbad Wollishofen) Seestrasse \n", + "4 Z001M001 Unbekannt Z001 Seestrasse (Strandbad Wollishofen) Seestrasse \n", + "\n", + " HNr Hoehe EKoord NKoord Richtung Knummer \\\n", + "0 451 Unbekannt 2683009.89 1243936.2 auswärts 789 \n", + "1 451 Unbekannt 2683009.89 1243936.2 auswärts 789 \n", + "2 451 Unbekannt 2683009.89 1243936.2 auswärts 789 \n", + "3 451 Unbekannt 2683009.89 1243936.2 auswärts 789 \n", + "4 451 Unbekannt 2683009.89 1243936.2 auswärts 789 \n", + "\n", + " Kname AnzDetektoren D1ID D2ID D3ID \\\n", + "0 Badanstalt Wollishofen 1 2 Unbekannt Unbekannt \n", + "1 Badanstalt Wollishofen 1 2 Unbekannt Unbekannt \n", + "2 Badanstalt Wollishofen 1 2 Unbekannt Unbekannt \n", + "3 Badanstalt Wollishofen 1 2 Unbekannt Unbekannt \n", + "4 Badanstalt Wollishofen 1 2 Unbekannt Unbekannt \n", + "\n", + " D4ID MessungDatZeit LieferDat AnzFahrzeuge AnzFahrzeugeStatus \n", + "0 Unbekannt 2022-01-01T00:00:00 2022-12-30 166.0 Gemessen \n", + "1 Unbekannt 2022-01-01T01:00:00 2022-12-30 255.0 Gemessen \n", + "2 Unbekannt 2022-01-01T02:00:00 2022-12-30 168.0 Gemessen \n", + "3 Unbekannt 2022-01-01T03:00:00 2022-12-30 96.0 Gemessen \n", + "4 Unbekannt 2022-01-01T04:00:00 2022-12-30 63.0 Gemessen \n" ] } ], @@ -219,7 +232,8 @@ "\n", "tables_dict = {}\n", "pd.set_option(\"display.max_columns\", None)\n", - "data_dir = 'data/'\n", + "data_dir = 'datasets/'\n", + "\n", "for year in range(2012, 2023):\n", " file_name = f'sid_dav_verkehrszaehlung_miv_OD2031_{year}.csv'\n", " file_path = os.path.join(data_dir, file_name)\n", @@ -701,7 +715,7 @@ "else:\n", " print(f\"Failed to download data. Status code: {response.status_code}\")\n", "\n", - "accidents_file_path = os.path.join(data_dir, ')\n" + "accidents_file_path = os.path.join(data_dir)\n" ] } ], diff --git a/README.md b/README.md index 318ce9a..bae52c9 100644 --- a/README.md +++ b/README.md @@ -1,29 +1,14 @@ -# Databases Project +# Database Project Group 1 -Use this repository for your integration code and any source code created while working on your project (ad-hoc code, -SQL queries, project files needed by external tools, etc.). +## Preliminaries +* Ensure you have access to a running postgres instance +* Ensure you have ```python3``` and ```pip``` installed. +* From within the root of the project, run ```pip install -r requirements.txt```. This insures all python dependecies are met. +* In ```src/fill_db.py``` look for the ```db_info``` variable and adapt it to your credentials. -- Merge your code into the main branch on the due date. -- Do not commit datasets! -- Any other document (except for the dump in the final hand-in) should be handed-in via ADAM. - -If you have any questions regarding the project, please do not hesitate to ask during the exercise lessons or via mail -to [raphael.waltenspuel@unibas.ch](mailto:raphael.waltenspuel@unibas.ch)! - -It is recommended that you first create a ```.gitignore``` file. (And exclude the "datasets" folder, for example). 
A useful tool for creating ```.gitignore``` files is www.gitignore.io. -Feel free to update or replace this readme with a brief description of your project and goals. -### Database setup guide -1. Make sure all the requirements in ’requirements.txt’ are met. If they are not -met, run pip install -r requirements.txt in the root of the project. -2. Run the python script ’integrate.py’ in the ’src’ folder. Set all booleans to -’False’ in the main methode of the script. If the datasets have already been -downloaded, set all the booleans to ’True’. The datasets need to be in a -folder named ’datasets’ in ’src’ (this should be set up automatically by the -script). -3. Ensure you have a running Postgres instance with a database. -4. Ensure you have the correct credentials in the python script ’fill_db.py’ in -’dbinfo’ -5. Run ’fill_db.py’ \ No newline at end of file +## Action +In the following, the order matters. +1. Run ```ensure_dirs_exist.py```. This makes sure all the directories needed to perform the data integration and logging exist. +2. Run ```integrate.py```. Adjust the main method to fit your needs. In particular, adjust the ```process_all_data_sources()``` call so that the parameter corresponding to a dataset is ```False``` if the script should download it from the internet, and ```True``` otherwise. To get the geojson data for signaled speeds in the city of Zurich, uncomment the line in the ``main`` method where you find ```load_tempo_geojson_from_api_to_local()```. +3. Run ```fill_db.py```. This will load the data into the database based on the credentials given in the ``db_info`` variable. +4. Perform the analysis. \ No newline at end of file diff --git a/docs/accident_loc_urls.txt b/docs/accident_loc_urls.txt new file mode 100644 index 0000000..1378079 --- /dev/null +++ b/docs/accident_loc_urls.txt @@ -0,0 +1 @@ +https://data.stadt-zuerich.ch/dataset/sid_dav_strassenverkehrsunfallorte/download/RoadTrafficAccidentLocations.json diff --git a/docs/all_csv_urls.txt b/docs/all_csv_urls.txt new file mode 100644 index 0000000..b9731a0 --- /dev/null +++ b/docs/all_csv_urls.txt @@ -0,0 +1,22 @@ +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2012.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2013.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2014.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2015.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2016.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2017.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2018.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2019.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2020.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2021.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2022.csv
+https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2012_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2013_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2014_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2015_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2016_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2017_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2018_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2019_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2020_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2021_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2022_verkehrszaehlungen_werte_fussgaenger_velo.csv \ No newline at end of file diff --git a/docs/diary.md b/docs/diary.md index e69de29..fc63c34 100644 --- a/docs/diary.md +++ b/docs/diary.md @@ -0,0 +1,20 @@ +# TODOs +* Write a script that makes tables and inserts the data. +* Find out if data cleaning can be done in python with pandas or if it must all be done in SQL scripts. + +# Project Diary + +| Version
0.00 | Author:
michel.romancuk@stud.unibas.ch
sebastian.lenzlinger@unibas.ch
| HS 2023
Databases
| +|-----------------------|-------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------| +| Date | | Problems | +| October / pre 16.11.23 | Decision to use postgres server.
Server setup.
Setup pgadmin at [pgadmin.slenzlinger.dev](pgadmin.slenzlinger.dev) | | +| 16.11.23 | Setup Repo and written some instructions into ``wiki.md`` on how to setup environment. | Realize steps not written down how postgres, pgadmin, nginx etc. was setup at the time. | +| | | | +| | | | +| | | | +| | | | +| | | | +| | | | +| | | | +| | | | +| | \ No newline at end of file diff --git a/docs/foot_bike_zaehlung_urls.txt b/docs/foot_bike_zaehlung_urls.txt new file mode 100644 index 0000000..00f6353 --- /dev/null +++ b/docs/foot_bike_zaehlung_urls.txt @@ -0,0 +1,11 @@ +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2012_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2013_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2014_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2015_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2016_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2017_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2018_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2019_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2020_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2021_verkehrszaehlungen_werte_fussgaenger_velo.csv +https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2022_verkehrszaehlungen_werte_fussgaenger_velo.csv \ No newline at end of file diff --git a/docs/verkehrszaehlung_moto_urls.txt b/docs/verkehrszaehlung_moto_urls.txt new file mode 100644 index 0000000..427888d --- /dev/null +++ b/docs/verkehrszaehlung_moto_urls.txt @@ -0,0 +1,11 @@ +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2012.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2013.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2014.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2015.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2016.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2017.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2018.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2019.csv 
+https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2020.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2021.csv +https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2022.csv \ No newline at end of file diff --git a/docs/wiki.md b/docs/wiki.md index e69de29..cb937d3 100644 --- a/docs/wiki.md +++ b/docs/wiki.md @@ -0,0 +1,25 @@ +# Setup Of Virtual Python dev env +First open the terminal and make sure to be in the root directory. +All steps assume one is in the root folder. +## Create Virtual environment +``` +python3 -m venv db23-project-venv +``` +## Activating the virtual environment +``` +source db23-project-venv/bin/activate +``` +#### When in the environment ``db23-project-venv`` just install all needed packages. +``` +pip3 install pkg_name +``` +## Getting back out +``` +deactivate +``` + +# List of used packages +See ``requirements.txt`` + +# Setting up postgres +# Setting up pgadmin as a container served by nginx \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..49eb76e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,112 @@ +anyio==4.0.0 +appnope==0.1.3 +argon2-cffi==23.1.0 +argon2-cffi-bindings==21.2.0 +arrow==1.3.0 +asttokens==2.4.1 +async-lru==2.0.4 +attrs==23.1.0 +Babel==2.13.1 +beautifulsoup4==4.12.2 +black==23.11.0 +bleach==6.1.0 +certifi==2023.7.22 +cffi==1.16.0 +charset-normalizer==3.3.2 +click==8.1.7 +click-plugins==1.1.1 +cligj==0.7.2 +comm==0.2.0 +debugpy==1.8.0 +decorator==5.1.1 +defusedxml==0.7.1 +executing==2.0.1 +fastjsonschema==2.19.0 +fiona==1.9.5 +fqdn==1.5.1 +GeoAlchemy2==0.14.2 +geopandas==0.14.1 +idna==3.4 +ipykernel==6.26.0 +ipython==8.17.2 +ipywidgets==8.1.1 +isoduration==20.11.0 +jedi==0.19.1 +Jinja2==3.1.2 +json5==0.9.14 +jsonpointer==2.4 +jsonschema==4.19.2 +jsonschema-specifications==2023.11.1 +jupyter==1.0.0 +jupyter-console==6.6.3 +jupyter-events==0.9.0 +jupyter-lsp==2.2.0 +jupyter_client==8.6.0 +jupyter_core==5.5.0 +jupyter_server==2.10.1 +jupyter_server_terminals==0.4.4 +jupyterlab==4.0.8 +jupyterlab-pygments==0.2.2 +jupyterlab-widgets==3.0.9 +jupyterlab_server==2.25.1 +MarkupSafe==2.1.3 +matplotlib-inline==0.1.6 +mistune==3.0.2 +mypy-extensions==1.0.0 +nbclient==0.9.0 +nbconvert==7.11.0 +nbformat==5.9.2 +nest-asyncio==1.5.8 +notebook==7.0.6 +notebook_shim==0.2.3 +numpy==1.26.2 +overrides==7.4.0 +packaging==23.2 +pandas==2.1.3 +pandocfilters==1.5.0 +parso==0.8.3 +pathspec==0.11.2 +pexpect==4.8.0 +platformdirs==4.0.0 +prometheus-client==0.18.0 +prompt-toolkit==3.0.41 +psutil==5.9.6 +psycopg2==2.9.9 +ptyprocess==0.7.0 +pure-eval==0.2.2 +pycparser==2.21 +Pygments==2.16.1 +pyproj==3.6.1 +python-dateutil==2.8.2 +python-json-logger==2.0.7 +pytz==2023.3.post1 +PyYAML==6.0.1 +pyzmq==25.1.1 +qtconsole==5.5.1 +QtPy==2.4.1 +referencing==0.31.0 +requests==2.31.0 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rpds-py==0.13.0 +Send2Trash==1.8.2 +shapely==2.0.2 +six==1.16.0 +sniffio==1.3.0 +soupsieve==2.5 +SQLAlchemy==2.0.23 +stack-data==0.6.3 +terminado==0.18.0 +tinycss2==1.2.1 +tornado==6.3.3 +traitlets==5.13.0 +types-python-dateutil==2.8.19.14 +typing_extensions==4.8.0 +tzdata==2023.3 +uri-template==1.3.0 +urllib3==2.1.0 +wcwidth==0.2.10 +webcolors==1.13 +webencodings==0.5.1 +websocket-client==1.6.4 +widgetsnbextension==4.0.9 diff --git a/src/data_utils.py b/src/data_utils.py new file mode 100644 index
0000000..8dcdcd5 --- /dev/null +++ b/src/data_utils.py @@ -0,0 +1,139 @@ +import json +import os +import pandas as pd +import requests +from urllib.parse import urlparse +import geopandas as gpd +from concurrent.futures import ThreadPoolExecutor as tpe +import logging + +logging.basicConfig(level=logging.DEBUG, filename='logs/data_utils.log', format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger('data_utils.py') +stream_handler = logging.StreamHandler() +formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') +stream_handler.setFormatter(formatter) +logger.addHandler(stream_handler) + + +def download_csv(url, local_filename): + with requests.get(url, stream=True) as r: + r.raise_for_status() + with open(local_filename, 'wb') as f: + for chunk in r.iter_content(chunk_size=8192): + f.write(chunk) + + +def process_urls(data_dir, urls_file): + # Ensure the data directory exists + if not os.path.exists(data_dir): + os.makedirs(data_dir) + + # Read URLs from the file + with open(urls_file, 'r') as file: + urls = file.readlines() + + # Process each URL + for url in urls: + url = url.strip() + filename = os.path.basename(urlparse(url).path) + local_filename = os.path.join(data_dir, filename) + + # Check if the file already exists + if not os.path.isfile(local_filename): + logger.debug(f"Downloading {url}...") + download_csv(url, local_filename) + logger.debug(f"Saved to {local_filename}") + else: + print(f"File {filename} already exists in {data_dir}, skipping download.") + + +def load_dataframe_from_csv(filepath): + try: + df = pd.read_csv(filepath, low_memory=False) + return df + except Exception as e: + logger.error(f"Error loading {filepath}: {e}") + return None + + +def load_dataframes_from_csv_files(data_dir, u_string): + dataframes = [] + + # with tpe(max_workers=5) as executor: + # for filename in os.listdir(data_dir): + # if (u_string in filename) and filename.endswith('.csv'): + # filepath = os.path.join(data_dir, filename) + # future = executor.submit(load_dataframe_from_csv, filepath) + # dataframes.append(future) + # + # dataframes = [future.result() for future in dataframes if future.result() is not None] + # + # return dataframes + + for filename in os.listdir(data_dir): + if (u_string in filename) and filename.endswith('.csv'): + filepath = os.path.join(data_dir, filename) + df = pd.read_csv(filepath, low_memory=False) + logger.debug(f'Duplicate Rows for {filename}: {df[df.duplicated()].shape[0]}') + df = df.drop_duplicates() + logger.debug(f'Duplicate Rows after DROPPING for {filename}: {df[df.duplicated()].shape[0]}') + dataframes.append(df.drop_duplicates()) + return dataframes + + +def load_dataframes_from_geojson_files(data_dir, u_string): + print('u_string', u_string) + gdf = gpd.GeoDataFrame() + for filename in os.listdir(data_dir): + #print("Filename:", filename) + if (u_string in filename) and filename.endswith('.json'): + filepath = os.path.join(data_dir, filename) + print("Filepath:", filepath) + gdf = gpd.read_file(filepath) + + return gdf + + +def combine_dataframes(dataframes): + if dataframes: + combined_dataframe = pd.concat(dataframes, ignore_index=True) + logger.debug(f'Duplicate Rows after combining: {combined_dataframe[combined_dataframe.duplicated()]}') + return combined_dataframe + else: + print("No dataframes to combine") + return pd.DataFrame() + + +def create_unified_df(urls_file, u_string, data_dir, files_present=False): + df_list = [] + df_unified = None + if not files_present: + 
process_urls(data_dir, urls_file) + + df_list = load_dataframes_from_csv_files(data_dir, u_string) + df_unified = combine_dataframes(df_list) + + return df_unified + + +def load_file_from_api(api_link, target_name, integrated_dir): + response = requests.get(api_link) + final_location = os.path.join(integrated_dir, target_name) + if response.status_code == 200: + logger.info(f"Succesfull get from {api_link}") + data = response.json() + with open(f'{final_location}.geojson', 'w') as file: + json.dump(data, file) + logger.info(f"{api_link} successfully downloaded and saved to {final_location}") + else: + logger.critical(f"Failed to get data. Status Code: {response.status_code}") +def save_dataframe_to_csv(df, integrated_dir, filename): + pass + + +if __name__ == "__main__": + csv_urls_file = '../docs/all_csv_urls.txt' + datasets_dir = 'datasets/' + output_file = 'column_names.txt' + process_urls(datasets_dir, csv_urls_file) + # extract_column_names(datasets_dir, output_file) diff --git a/src/ensure_dirs_exist.py b/src/ensure_dirs_exist.py new file mode 100644 index 0000000..2ac3d57 --- /dev/null +++ b/src/ensure_dirs_exist.py @@ -0,0 +1,26 @@ +import logging +import os +""" +The functionality of this script has been adapted from data_utils.ensure_dirs_exist(). +This needs to be run before any other script. +""" +data_dir = 'datasets/' +integrated_dir = 'datasets/integrated/' +logs_dir = 'logs/' + +logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger('integrate.py') +stream_handler = logging.StreamHandler() +formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') +stream_handler.setFormatter(formatter) +logger.addHandler(stream_handler) + +logger.debug(f'data_dir: {data_dir}\n integrated_dir: {integrated_dir}') +logger.info("Ensuring needed directories exist.") +os.makedirs(data_dir, exist_ok=True) +logger.debug("data_dir created.") +os.makedirs(integrated_dir, exist_ok=True) +logger.debug("integrated_dir created") +os.makedirs(logs_dir, exist_ok=True) +logger.debug("logs_dir created") diff --git a/src/fill_db.py b/src/fill_db.py new file mode 100644 index 0000000..003b451 --- /dev/null +++ b/src/fill_db.py @@ -0,0 +1,108 @@ +import logging +import psycopg2 +import subprocess + +logging.basicConfig(level=logging.DEBUG, filename='logs/fill_db.log', + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger('fill_db.py') +stream_handler = logging.StreamHandler() +formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') +stream_handler.setFormatter(formatter) +logger.addHandler(stream_handler) + +integrated_dir = 'datasets/integrated/' +accident_geojson_file = 'datasets/integrated/Accidents.geojson' +signaled_speeds_file = 'datasets/integrated/signaled_speeds.geojson.geojson' +accident_loader_script = 'load_accidents_into_db.sh' +accident_table_name = 'accidents' +signaled_speeds_table_name = 'signaled_speeds' + +""" +Make sure db_info contain the correct credentials +""" +db_info = { + 'host': 'localhost', + 'database': 'test-db23', + 'port': '5432', + 'user': 'seb', + 'password': '', +} +setup_tables_script = 'setup_tables.sql' +load_csvs_into_db_script = 'load_csvs_into_db.sql' + + +def run_sql(script, db_info): + db_connection = psycopg2.connect(**db_info) + db_cursor = db_connection.cursor() + + with open(script, 'r') as sql_file: + sql_script = sql_file.read() + + try: + db_cursor.execute(sql_script) + 
db_connection.commit() + logger.info(f'{script} executed successfully') + except Exception as e: + db_connection.rollback() + logger.exception(f'Error executing {sql_script}: {e}') + finally: + db_cursor.close() + db_connection.close() + + +def run_geojson_loader_script(script, *args): + try: + cmd = ['bash', script] + list(args) + res = subprocess.run(cmd, check=True, text=True, capture_output=True) + logger.info(f'{script} executed successfully. Output: {res.stdout}') + except subprocess.CalledProcessError as e: + logger.exception(f'Error executing {script}: {e}') + logger.info(f"Remember to set the correct permissions for the script: chmod +x {script}") + + +def geojson_loader(*args, modus='append'): + """ + Use this instead of run_geojson_loader_script() in the main method to avoid the bash dependency. + :param args: All the arguments needed for ogr2org to run properly + :param modus: append or overwrite db table + :return: + """ + geojson_file, db_name, db_user, db_password, db_host, db_port, target_table = args + cmd = [ + "ogr2ogr", + "-f", "PostgreSQL", + f"PG:dbname='{db_name}' host='{db_host}' port='{db_port}' user='{db_user}' password='{db_password}'", + geojson_file, + "-nln", target_table, + f"-{modus}" + ] + try: + # Run the command + res = subprocess.run(cmd, check=True, text=True, capture_output=True) + logger.info(f"ogr2ogr command executed successfully. Output: {res.stdout}") + except subprocess.CalledProcessError as e: + logger.exception(f"Error executing ogr2ogr command: {e}") + + +if __name__ == '__main__': + run_sql(setup_tables_script, db_info) + logger.info("Finnished setting up tables.") + run_sql(load_csvs_into_db_script, db_info) + logger.info("Finnished loading csv into db.") + run_geojson_loader_script(accident_loader_script, + accident_geojson_file, + db_info['database'], + db_info['user'], + db_info['password'], + db_info['host'], + db_info['port'], + accident_table_name) + logger.info('Finished loading accident geojson into db using bash script.') + geojson_loader(signaled_speeds_file, + db_info['database'], + db_info['user'], + db_info['password'], + db_info['host'], + db_info['port'], + signaled_speeds_table_name, + modus='overwrite') diff --git a/src/integrate.py b/src/integrate.py new file mode 100644 index 0000000..d37d174 --- /dev/null +++ b/src/integrate.py @@ -0,0 +1,245 @@ +import data_utils as du +import os +import pandas as pd +import geopandas as gpd +import time +from shapely.geometry import Point +import re + +import logging + +logging.basicConfig(level=logging.DEBUG, filename='logs/integrate.log', + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger('integrate.py') +stream_handler = logging.StreamHandler() +formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') +stream_handler.setFormatter(formatter) +logger.addHandler(stream_handler) + +foot_bike_urls_file = '../docs/foot_bike_zaehlung_urls.txt' +miv_file_urls = '../docs/verkehrszaehlung_moto_urls.txt' +accident_file_url = '../docs/accident_loc_urls.txt' + +# Using u_string to discriminate between files that belong to each other +motor_file_u_string = 'sid_dav_verkehrszaehlung_miv_OD2031' +foot_bike_file_u_string = 'velo.csv' +accident_file_u_string = 'RoadTrafficAccidentLocations.json' + +data_dir = 'datasets/' +integrated_dir = 'datasets/integrated/' +logs_dir = 'logs/' + +signaled_speeds_json_api = 
'https://www.ogd.stadt-zuerich.ch/wfs/geoportal/Signalisierte_Geschwindigkeiten?service=WFS&version=1.1.0&request=GetFeature&outputFormat=GeoJSON&typename=view_geoserver_tempo_ist' + +weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'] + +fb_data_types = { + 'ID': 'int', + 'NORD': 'int', + 'OST': 'int', + 'DATE': 'str', + 'HRS': 'int', + 'VELO_IN': 'int', + 'VELO_OUT': 'int', + 'FUSS_IN': 'int', + 'FUSS_OUT': 'int', + 'Weekday_en': 'str' +} + +miv_data_types = { + 'ID': 'int', + 'MSID': 'str', + 'ZSID': 'str', + 'Achse': 'str', + 'NKoord': 'int', + 'EKoord': 'int', + 'Richtung': 'str', + 'AnzFahrzeuge': 'int', + 'AnzFahrzeugeStatus': 'str', + 'Datum': 'str', + 'Hrs': 'int', + 'Weekday_en': 'str', +} + +acc_data_types = { + 'AccidentUID': 'str', + 'AccidentYear': 'int', + 'AccidentMonth': 'int', + 'AccidentWeekDay_en': 'str', + 'AccidentHour': 'int', + 'NKoord': 'int', + 'EKoord': 'int', + 'AccidentType_en': 'str', + 'AccidentType': 'str', + 'AccidentSeverityCategory': 'str', + 'AccidentInvolvingPedestrian': 'bool', + 'AccidentInvolvingBicycle': 'bool', + 'AccidentInvolvingMotorcycle': 'bool', + 'RoadType': 'str', + 'RoadType_en': 'str', + 'geometry': 'str' # TODO: Figure out what dtype this needs to be for postgres +} +# def ensure_dirs_exist(data_dir, integrated_dir, logs_dir): +# """ +# This should be called before anything else to make sure that the relevant directories exists. +# :param data_dir: directory where the datasets are stored +# :param integrated_dir: directory where the integrated data will be stored +# :return: +# """ +# logger.debug(f'data_dir: {data_dir}\n integrated_dir: {integrated_dir}') +# logger.info("Ensuring needed directories exist.") +# os.makedirs(data_dir, exist_ok=True) +# logger.debug("data_dir created.") +# os.makedirs(integrated_dir, exist_ok=True) +# logger.debug("integrated_dir created") +# os.makedirs(logs_dir, exist_ok=True) +# logger.debug("logs_dir created") + + +def process_foot_bike_data(files_present=True): + fb_df_unified = du.create_unified_df(foot_bike_urls_file, foot_bike_file_u_string, data_dir, + files_present=files_present) + fb_df_unified[['DATE', "TIME"]] = fb_df_unified['DATUM'].str.split('T', expand=True) + fb_df_unified[['HRS', 'MINS']] = fb_df_unified['TIME'].str.split(':', expand=True) + ## Evt brauchen wir doch FK_ZAEHLER + fb_cols_to_drop = ['DATUM'] + fb_df_unified_correct_cols = fb_df_unified.drop(columns=fb_cols_to_drop, axis=1) + fb_df_unified_correct_cols.fillna(0, inplace=True) + fb_df_grouped = fb_df_unified_correct_cols.groupby(['OST', 'NORD', 'DATE', 'HRS']).agg({ + 'VELO_IN': 'sum', + 'VELO_OUT': 'sum', + 'FUSS_IN': 'sum', + 'FUSS_OUT': 'sum' + }).reset_index() + dt_obj = pd.to_datetime(fb_df_grouped['DATE']) + days = dt_obj.dt.weekday + fb_df_grouped['Weekday_en'] = days.map(lambda x: weekday_names[x]) + cleaned_fb_df = fb_df_grouped + cleaned_fb_df['ID'] = cleaned_fb_df.index + 1 + cleaned_fb_df = cleaned_fb_df[['ID', 'NORD', 'OST', 'DATE', 'HRS', 'VELO_IN', 'VELO_OUT', 'FUSS_IN', + 'FUSS_OUT', 'Weekday_en']] + # Ensure datatype of df and sql table match + cleaned_fb_df = cleaned_fb_df.astype(fb_data_types) + return cleaned_fb_df + + +def process_miv_data(files_present=True): + miv_df_unified = du.create_unified_df(miv_file_urls, motor_file_u_string, data_dir, files_present=files_present) + + miv_df_unified[['Datum', "Time"]] = miv_df_unified['MessungDatZeit'].str.split('T', expand=True) + miv_df_unified[['Hrs', 'Mins', 'Sec']] = miv_df_unified['Time'].str.split(':', 
expand=True) + + miv_cols_to_keep = ['MSID','ZSID','Achse', 'NKoord', 'EKoord', 'Richtung', 'AnzFahrzeuge', 'AnzFahrzeugeStatus', + 'Datum', 'Hrs',] + miv_df_cols_dropped = miv_df_unified[miv_cols_to_keep] + + dt_obj = pd.to_datetime(miv_df_cols_dropped['Datum']) + days = dt_obj.dt.weekday + miv_df_cols_dropped.loc[:, 'Weekday_en'] = days.map(lambda x: weekday_names[x]) + + miv_df_cols_dropped.loc[:, 'AnzFahrzeuge'] = miv_df_cols_dropped['AnzFahrzeuge'].fillna(0).astype(int) + miv_df_cols_dropped.loc[:, 'ZSID'] = miv_df_cols_dropped['ZSID'].fillna('Missing').astype(str) + miv_df_cols_dropped['ID'] = (miv_df_cols_dropped.index + 1).copy() + + cleaned_miv_df = miv_df_cols_dropped[['ID', 'MSID', 'ZSID', 'Achse', 'NKoord', 'EKoord', 'Richtung', 'AnzFahrzeuge', + 'AnzFahrzeugeStatus', 'Datum', 'Hrs', 'Weekday_en']] + + cleaned_miv_df = cleaned_miv_df.astype(miv_data_types) + cleaned_miv_df = cleaned_miv_df.drop_duplicates() + return cleaned_miv_df + + +def process_accident_data(file_present: bool = True): + if not file_present: + du.process_urls(data_dir, accident_file_url) + acc_df_unified = du.load_dataframes_from_geojson_files(data_dir, accident_file_u_string) + acc_cols_to_keep = ['AccidentUID', 'AccidentYear', 'AccidentMonth', 'AccidentWeekDay_en','AccidentHour', + 'AccidentLocation_CHLV95_N', 'AccidentLocation_CHLV95_E', 'AccidentType_en', 'AccidentType', + 'AccidentSeverityCategory', 'AccidentInvolvingPedestrian', 'AccidentInvolvingBicycle', + 'AccidentInvolvingMotorcycle', 'RoadType', 'RoadType_en', + 'geometry'] + cleaned_acc_df = acc_df_unified[acc_cols_to_keep] + cleaned_acc_df.rename(columns={ + 'AccidentLocation_CHLV95_E': 'EKoord', + 'AccidentLocation_CHLV95_N': 'NKoord', + }, inplace=True) + + cleaned_acc_df = cleaned_acc_df.astype(acc_data_types) + return cleaned_acc_df + + +def process_all_data_sources(fb_present=True, miv_present=True, accident_present=True): + """ + Process all data sources and turn them into csv files.
After this function is called there + should be csv files of the cleaned and integrated data sources + + :param fb_present: bool, if the files present in local file system + :param miv_present: bool, if the files present in local file system + :param accident_present: bool, if the files present in local file system + :return: + """ + # ensure_dirs_exist(data_dir, integrated_dir) + logger.info("Started processing all data sources.") + fb_to_integrated(fb_present) + + miv_to_integrated_csv(miv_present) + + acc_to_cleaned_geojson(accident_present) + + +def fb_to_integrated(files_present=True): + + start_time = time.time() + logger.info("Start processing pedestrian and bicycle data (FootBikeCount)") + fb_count_df = process_foot_bike_data(files_present) + logger.debug(f'FB Head:{fb_count_df.head()}\n FB dtypes: {fb_count_df.dtypes}') + fb_file_path = os.path.join(integrated_dir, 'FootBikeCount.csv') + logger.debug(f'FB Cleaned File Path: {fb_file_path}') + fb_count_df.to_csv(fb_file_path, index=False) + logger.info("FB integrated csv created.") + end_time = time.time() + logger.info(f'Time taken for FootBikeCount: {end_time-start_time}') + + +def miv_to_integrated_csv(miv_present=True): + + start_time2 = time.time() + logger.info("Start processing motorized vehicle data (MivCount)") + miv_count_df = process_miv_data(miv_present) + logger.debug(f'MIV Head:{miv_count_df.head()}\n MIV dtypes: {miv_count_df.dtypes}') + miv_file_path = os.path.join(integrated_dir, 'MivCount.csv') + logger.debug(f'MIV Cleaned File Path: {miv_file_path}') + miv_count_df.to_csv(miv_file_path, index=False) + logger.info("MIV integrated csv created.") + end_time = time.time() + logger.info(f'Time taken for MivCount: {end_time-start_time2}') + + +def acc_to_cleaned_geojson(acc_present=True): + start_time3 = time.time() + logger.info("Start processing accident data (Accidents)") + acc_df = process_accident_data(acc_present) + logger.debug(f'ACC Head: { acc_df.head()}\n Acc dtypes: {acc_df.dtypes}') + acc_file_path = os.path.join(integrated_dir, 'Accidents.geojson') + logger.debug(f'Acc Cleaned file path: {acc_file_path}') + acc_df['geometry'] = acc_df['geometry'].apply(lambda row: re.findall(r"[-+]?\d*\.\d+|\d+", row)) + # Create a Point object using the extracted coordinates + acc_df['geometry'] = acc_df['geometry'].apply( + lambda coords: Point(float(coords[0]), float(coords[1]), float(coords[2]))) + acc_gdf = gpd.GeoDataFrame(acc_df, geometry='geometry') + acc_gdf.to_file(acc_file_path, driver='GeoJSON') + logger.info("ACC integrated csv created.") + end_time = time.time() + logger.info(f'Time taken for Accidents: {end_time - start_time3}') + + +def load_tempo_geojson_from_api_to_local(): + du.load_file_from_api(signaled_speeds_json_api, 'signaled_speeds.geojson', integrated_dir) + + +if __name__ == '__main__': + # ensure_dirs_exist(data_dir, integrated_dir, logs_dir) + # process_all_data_sources(True, True, True) + # miv_to_integrated_csv() + # acc_to_cleaned_geojson() + load_tempo_geojson_from_api_to_local() diff --git a/src/load_accidents_into_db.sh b/src/load_accidents_into_db.sh new file mode 100644 index 0000000..9593d63 --- /dev/null +++ b/src/load_accidents_into_db.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# Define parameters +GEOJSON_FILE=$1 +DB_NAME=$2 +DB_USER=$3 +DB_PASSWORD=$4 +DB_HOST=$5 +DB_PORT=$6 +TARGET_TABLE=$7 + +# Run ogr2ogr command +ogr2ogr -f "PostgreSQL" PG:"dbname='$DB_NAME' host='$DB_HOST' port='$DB_PORT' user='$DB_USER' password='$DB_PASSWORD'" "$GEOJSON_FILE" -nln $TARGET_TABLE -append + +echo 
"GeoJSON data has been imported into $TARGET_TABLE" diff --git a/src/load_csvs_into_db.sql b/src/load_csvs_into_db.sql new file mode 100644 index 0000000..77ceb25 --- /dev/null +++ b/src/load_csvs_into_db.sql @@ -0,0 +1,7 @@ +COPY FootBikeCount FROM '/Users/seb/Projects/repos/group-1/src/datasets/integrated/FootBikeCount.csv' + DELIMITER ',' + CSV HEADER; + +COPY MivCount FROM '/Users/seb/Projects/repos/group-1/src/datasets/integrated/MivCount.csv' + DELIMITER ',' + CSV HEADER; \ No newline at end of file diff --git a/src/queries.sql b/src/queries.sql new file mode 100644 index 0000000..f127869 --- /dev/null +++ b/src/queries.sql @@ -0,0 +1,42 @@ +select p.id, a.accidentuid, m.id +from footbikecount p, accidents a, mivcount m +where p.weekday_en = a.accidentweekday_en AND a.accidentweekday_en = m.weekday_en +AND p.weekday_en = m.weekday_en AND p.hrs = a.accidenthour AND a.accidenthour = m.hrs +AND p.hrs = m.hrs AND (p.ost - m.ekoord between -100 AND 100) AND (p.nord - m.nkoord between -100 AND 100); + +DROP TABLE IF EXISTS Contemporaneous2; + +CREATE TABLE Contemporaneous2 ( + p_id INTEGER, + accidentuid VARCHAR(256), + m_id INTEGER, + weekday_en VARCHAR(10), + hrs INTEGER, + distance DOUBLE PRECISION +); + + +CREATE TABLE Intermediate2 AS +SELECT + p.id AS p_id, + a.accidentuid, + m.id AS m_id, + p.weekday_en, + p.hrs, + SQRT(POWER(p.ost - m.ekoord, 2) + POWER(p.nord - m.nkoord, 2)) AS distance +FROM + footbikecount p, + accidents a, + mivcount m +WHERE + p.weekday_en = a.accidentweekday_en + AND a.accidentweekday_en = m.weekday_en + AND p.weekday_en = m.weekday_en + AND p.hrs = a.accidenthour + AND a.accidenthour = m.hrs + AND p.hrs = m.hrs + AND (p.ost - m.ekoord BETWEEN -100 AND 100) + AND (p.nord - m.nkoord BETWEEN -100 AND 100); + +INSERT INTO Contemporaneous2 (p_id, accidentuid, m_id, weekday_en, hrs, distance) +SELECT p_id, accidentuid, m_id, weekday_en, hrs, distance FROM Intermediate2; diff --git a/src/setup_tables.sql b/src/setup_tables.sql new file mode 100644 index 0000000..3a9881e --- /dev/null +++ b/src/setup_tables.sql @@ -0,0 +1,72 @@ +CREATE EXTENSION IF NOT EXISTS postgis; + +DROP TABLE IF EXISTS FootBikeCount; + +DROP TABLE IF EXISTS Accidents; + +DROP TABLE IF EXISTS MivCount; + + +CREATE TABLE FootBikeCount ( + ID INTEGER , + NORD INTEGER , + OST INT , + DATE VARCHAR(10) , + HRS INTEGER , + VELO_IN INTEGER , + VELO_OUT INTEGER , + FUSS_IN INTEGER , + FUSS_OUT INTEGER , + Weekday_en VARCHAR(10) , + + PRIMARY KEY(ID) , + CHECK (Weekday_en IN ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')), + CHECK (Hrs BETWEEN 0 AND 23) + + +); + + + +CREATE TABLE MivCount ( + ID INTEGER , + MSID VARCHAR(10) , + ZSID VARCHAR(10) , + Achse VARCHAR(256) , + NKoord INTEGER , + EKoord INTEGER , + Richtung VARCHAR(100) , + AnzFahrzeuge INTEGER , + AnzFahrzeugeStatus VARCHAR(20) , + Datum VARCHAR(10) , + Hrs Integer , + Weekday_en VARCHAR(10), + + PRIMARY KEY (ID), + CHECK (Weekday_en IN ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')), + CHECK (Hrs BETWEEN 0 AND 23) +); + + +CREATE TABLE Accidents ( + AccidentUID VARCHAR(256) , + AccidentYear INTEGER , + AccidentMonth INTEGER, + AccidentWeekDay_en VARCHAR(10) , + AccidentHour INTEGER , + NKoord INTEGER , + EKoord INTEGER , + AccidentType_en VARCHAR(256) , + AccidentType VARCHAR(4) , + AccidentSeverityCategory VARCHAR(4) , + AccidentInvolvingPedestrian BOOLEAN , + AccidentInvolvingBicycle BOOLEAN , + AccidentInvolvingMotorcycle BOOLEAN , + RoadType VARCHAR(5) , + RoadType_en 
VARCHAR(256) , + Geometry geometry(Point, 4326) , + + PRIMARY KEY (AccidentUID) , + CHECK ( AccidentHour BETWEEN 0 AND 23) , + CHECK (AccidentWeekDay_en IN ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')) +); \ No newline at end of file diff --git a/src/testArea.ipynb b/src/testArea.ipynb new file mode 100644 index 0000000..c0bcbf8 --- /dev/null +++ b/src/testArea.ipynb @@ -0,0 +1,233 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from datetime import datetime as dt\n", + "\n", + "import integrate as intr\n", + "\n" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-12-03T15:47:10.110909Z", + "start_time": "2023-12-03T15:47:09.656556Z" + } + }, + "id": "be55b25929d95559" + }, + { + "cell_type": "code", + "execution_count": 2, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/seb/Projects/repos/group-1/src/integrate.py:132: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " miv_df_cols_dropped['Weekday_en'] = days.map(lambda x: weekday_names[x])\n", + "/Users/seb/Projects/repos/group-1/src/integrate.py:133: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " miv_df_cols_dropped['AnzFahrzeuge'] = miv_df_cols_dropped['AnzFahrzeuge'].fillna(0).astype(int)\n" + ] + } + ], + "source": [ + "\n", + "miv_df = intr.process_miv_data()\n", + "#fb_data = intr.process_foot_bike_data()" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-12-03T15:49:07.561603Z", + "start_time": "2023-12-03T15:47:14.759104Z" + } + }, + "id": "dd3831953afdeb72" + }, + { + "cell_type": "code", + "execution_count": 6, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "121\n" + ] + } + ], + "source": [ + "duplicate_rows = miv_df[miv_df.duplicated()]\n", + "print(duplicate_rows.shape[0])" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-12-03T15:51:21.158909Z", + "start_time": "2023-12-03T15:51:15.711222Z" + } + }, + "id": "14471cd78389ce4d" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "acc_df = intr.process_accident_data(True)" + ], + "metadata": { + "collapsed": false + }, + "id": "f86bc612060b17a4" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "acc_df.head()\n", + "acc_df['AccidentWeekDay_en'].unique()\n", + "#acc_df.dtypes\n", + "\n" + ], + "metadata": { + "collapsed": false + }, + "id": "6affbeea6c7cf3ef" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "print(\"Accident Columns:\")\n", + "print(acc_df.dtypes)\n", + "print()\n", + "print(\"MIV Columns:\")\n", + "print(miv_df.dtypes)\n", + "print()\n", + "print(\"FB Cols:\")\n", + "print(fb_data.dtypes)" + ], + "metadata": { + "collapsed": false + }, + "id": "242041cd369d8454" + }, + { + "cell_type": "code", + "execution_count": 
null, + "outputs": [], + "source": [ + "acc_df['ID'] = acc_df.index +1\n", + "acc_df[('ID')]" + ], + "metadata": { + "collapsed": false + }, + "id": "1841925ee109a417" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "print(\"MIV unqiue:\", miv_df['EKoord'])\n", + "print(\"Acc unique:\", acc_df['RoadType'].unique)\n", + "print(\"FB unique: \", fb_data['DATE'])\n" + ], + "metadata": { + "collapsed": false + }, + "id": "f6d752ea17eda341" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "acc_df.head()" + ], + "metadata": { + "collapsed": false + }, + "id": "a159cafa9c227b88" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "from sqlalchemy import create_engine\n", + "from geoalchemy2 import Geometry, WKTElement\n", + "import geopandas as gpd\n", + "from shapely import wkt\n", + "\n", + "db_url = f'postgresql://seb:@localhost:5432/test-db23'\n", + "engine = create_engine(db_url)\n", + "\n", + "#miv_df.to_sql('table_name', engine, if_exists='replace', index=False)\n", + "#fb_data.to_sql('footbike', engine, if_exists='replace', index=False)\n", + "\n", + "geometry_column = 'geometry'\n", + "\n", + "\n", + "acc_df['geometry'] = acc_df['geometry'].apply(lambda geom: WKTElement(geom.wkt, srid=4326))\n", + "\n", + "acc_df.to_sql('accidents', engine, if_exists='replace', index=False, dtype={'geometry': Geometry('POINT', srid=4326)})\n", + "\n" + ], + "metadata": { + "collapsed": false + }, + "id": "fa76af8343443d7a" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "engine.dispose()" + ], + "metadata": { + "collapsed": false + }, + "id": "bc0a23a5126e76c2" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}
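The README's Action section lists the run order but leaves the invocation to the reader. Below is a minimal sketch of a wrapper that runs the three steps in that order; the wrapper itself is hypothetical (not part of the repository) and assumes it is executed from within ```src/``` with the project's virtual environment active.

```python
# Hypothetical convenience wrapper (not part of the repository): runs the three
# pipeline steps from the README in order. Assumes it is started from src/ with
# the project's virtual environment active.
import subprocess
import sys

STEPS = ["ensure_dirs_exist.py", "integrate.py", "fill_db.py"]

for script in STEPS:
    print(f"Running {script} ...")
    # check=True stops the pipeline as soon as one step fails.
    subprocess.run([sys.executable, script], check=True)
```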
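```fill_db.py``` only reports credential problems once it starts executing SQL. A small sanity check like the following, run beforehand, surfaces connection problems early; the ```db_info``` values are copied from the diff and have to be adapted to your own setup.

```python
# A minimal sketch (not part of the repository): verify that the credentials you
# put into db_info actually allow a connection before running fill_db.py.
import psycopg2

db_info = {
    'host': 'localhost',      # adapt to your setup
    'database': 'test-db23',  # adapt to your database name
    'port': '5432',
    'user': 'seb',            # adapt to your user
    'password': '',           # adapt to your password
}

conn = psycopg2.connect(**db_info)
try:
    with conn.cursor() as cur:
        cur.execute("SELECT version();")
        print(cur.fetchone()[0])
finally:
    conn.close()
```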
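The core of ```process_foot_bike_data()``` in ```src/integrate.py``` is the group-by on location, date and hour followed by the weekday mapping. The following toy sketch (made-up numbers, not project data) shows that aggregation step in isolation.

```python
# Toy illustration of the aggregation idea in process_foot_bike_data():
# sum the counts per location, date and hour, then map the date to an
# English weekday name. The numbers below are invented.
import pandas as pd

weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']

df = pd.DataFrame({
    'OST':      [2683009, 2683009, 2683100],
    'NORD':     [1243936, 1243936, 1243900],
    'DATE':     ['2022-01-01', '2022-01-01', '2022-01-02'],
    'HRS':      [13, 13, 8],
    'VELO_IN':  [4, 2, 7],
    'VELO_OUT': [1, 3, 0],
    'FUSS_IN':  [10, 5, 2],
    'FUSS_OUT': [8, 6, 1],
})

grouped = df.groupby(['OST', 'NORD', 'DATE', 'HRS'], as_index=False).agg(
    {'VELO_IN': 'sum', 'VELO_OUT': 'sum', 'FUSS_IN': 'sum', 'FUSS_OUT': 'sum'})
grouped['Weekday_en'] = pd.to_datetime(grouped['DATE']).dt.weekday.map(
    lambda i: weekday_names[i])
print(grouped)
```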
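In ```acc_to_cleaned_geojson()``` the geometry column arrives as a string; the numbers are pulled out with a regex and rebuilt as a 3-D shapely ```Point```. A toy sketch of that conversion, using a made-up input value rather than a record from the dataset:

```python
# Toy sketch of the coordinate handling in acc_to_cleaned_geojson(): extract the
# numbers with the same regex as in the script and build a 3-D shapely Point.
# The input string is a made-up example.
import re
from shapely.geometry import Point

raw = 'POINT Z (2683009.89 1243936.2 408.2)'
coords = re.findall(r"[-+]?\d*\.\d+|\d+", raw)
point = Point(float(coords[0]), float(coords[1]), float(coords[2]))
print(point.wkt)  # POINT Z (2683009.89 1243936.2 408.2)
```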
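The join in ```src/queries.sql``` relates pedestrian/bike counts, accidents and motorized counts by weekday, hour and a ±100 window on the LV95 coordinates; since LV95 east/north values are in metres, this amounts to a roughly 100-metre neighbourhood. A small sketch of the same distance rule, with made-up coordinates:

```python
# Sketch of the proximity rule used in Intermediate2/Contemporaneous2:
# LV95 coordinates (OST/NORD, EKoord/NKoord) are metres, so the +/-100 window
# plus the Euclidean distance mean "within roughly 100 m". Values are invented.
from math import sqrt

p_ost, p_nord = 2683009, 1243936       # pedestrian/bike counting station (made up)
m_ekoord, m_nkoord = 2683050, 1243990  # motorized-traffic station (made up)

within_box = abs(p_ost - m_ekoord) <= 100 and abs(p_nord - m_nkoord) <= 100
distance_m = sqrt((p_ost - m_ekoord) ** 2 + (p_nord - m_nkoord) ** 2)
print(within_box, round(distance_m, 1))
```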