Merge branch 'dev-explore' into 'main'

Milestone Merge

See merge request dbis/lecture-groups/database-systems/2023hs/group-1!3
Sebastian Lenzlinger 2023-12-03 22:58:31 +00:00
commit 8be3279ace
19 changed files with 1149 additions and 60 deletions

1
.gitignore vendored
View File

@ -1,6 +1,7 @@
# Created by https://www.toptal.com/developers/gitignore/api/python,pycharm,linux,macos,database,data
# Edit at https://www.toptal.com/developers/gitignore?templates=python,pycharm,linux,macos,database,data
datasets/
db23-project-venv/
### Data ###
*.csv
*.dat

View File

@ -2,14 +2,18 @@
"cells": [
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 1,
"id": "17ca2acb",
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
},
"tags": []
"tags": [],
"ExecuteTime": {
"end_time": "2023-11-16T16:20:49.426349Z",
"start_time": "2023-11-16T16:20:16.117316Z"
}
},
"outputs": [
{
@ -156,7 +160,11 @@
"2 Unbekannt 2016-01-01T02:00:00 2021-02-03 213.0 Gemessen \n",
"3 Unbekannt 2016-01-01T03:00:00 2021-02-03 112.0 Gemessen \n",
"4 Unbekannt 2016-01-01T04:00:00 2021-02-03 80.0 Gemessen \n",
"Data for year 2017:\n",
"File not found for year 2017: datasets/sid_dav_verkehrszaehlung_miv_OD2031_2017.csv\n",
"File not found for year 2018: datasets/sid_dav_verkehrszaehlung_miv_OD2031_2018.csv\n",
"File not found for year 2019: datasets/sid_dav_verkehrszaehlung_miv_OD2031_2019.csv\n",
"File not found for year 2020: datasets/sid_dav_verkehrszaehlung_miv_OD2031_2020.csv\n",
"Data for year 2021:\n",
" MSID MSName ZSID ZSName Achse \\\n",
"0 Z001M001 Unbekannt Z001 Seestrasse (Strandbad Wollishofen) Seestrasse \n",
"1 Z001M001 Unbekannt Z001 Seestrasse (Strandbad Wollishofen) Seestrasse \n",
@ -179,34 +187,39 @@
"4 Badanstalt Wollishofen 1 2 Unbekannt Unbekannt \n",
"\n",
" D4ID MessungDatZeit LieferDat AnzFahrzeuge AnzFahrzeugeStatus \n",
"0 Unbekannt 2017-01-01T00:00:00 2021-02-03 295.0 Gemessen \n",
"1 Unbekannt 2017-01-01T01:00:00 2021-02-03 264.0 Gemessen \n",
"2 Unbekannt 2017-01-01T02:00:00 2021-02-03 180.0 Gemessen \n",
"3 Unbekannt 2017-01-01T03:00:00 2021-02-03 107.0 Gemessen \n",
"4 Unbekannt 2017-01-01T04:00:00 2021-02-03 97.0 Gemessen \n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_311061/2135127822.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlow_memory\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0mtable_key\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34mf\"miv_{year}\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/lib/python3.9/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 946\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkwds_defaults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 947\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 948\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 949\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 950\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/lib/python3.9/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 615\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 616\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 617\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 618\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 619\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/lib/python3.9/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 1746\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1747\u001b[0m \u001b[0mcol_dict\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1748\u001b[0;31m \u001b[0;34m)\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m \u001b[0;31m# type: ignore[attr-defined]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1749\u001b[0m \u001b[0mnrows\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1750\u001b[0m )\n",
"\u001b[0;32m~/anaconda3/lib/python3.9/site-packages/pandas/io/parsers/c_parser_wrapper.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 237\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 238\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 239\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 240\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 241\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_first_chunk\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32mparsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.read\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mparsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._read_rows\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mparsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_column_data\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mparsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers._maybe_upcast\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m~/anaconda3/lib/python3.9/site-packages/numpy/core/multiarray.py\u001b[0m in \u001b[0;36mputmask\u001b[0;34m(a, mask, values)\u001b[0m\n\u001b[1;32m 1129\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1130\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1131\u001b[0;31m \u001b[0;34m@\u001b[0m\u001b[0marray_function_from_c_func_and_dispatcher\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_multiarray_umath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mputmask\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1132\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mputmask\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m/\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1133\u001b[0m \"\"\"\n",
"\u001b[0;32m_pydevd_bundle/pydevd_cython.pyx\u001b[0m in \u001b[0;36m_pydevd_bundle.pydevd_cython.SafeCallWrapper.__call__\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m_pydevd_bundle/pydevd_cython.pyx\u001b[0m in \u001b[0;36m_pydevd_bundle.pydevd_cython.ThreadTracer.__call__\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m~/anaconda3/lib/python3.9/site-packages/debugpy/_vendored/pydevd/_pydev_bundle/pydev_is_thread_alive.py\u001b[0m in \u001b[0;36mis_thread_alive\u001b[0;34m(t)\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_temp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'_is_stopped'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# Python 3.x has this\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0mis_thread_alive\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_is_stopped\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
"0 Unbekannt 2021-01-01T00:00:00 2021-12-30 122.0 Gemessen \n",
"1 Unbekannt 2021-01-01T01:00:00 2021-12-30 177.0 Gemessen \n",
"2 Unbekannt 2021-01-01T02:00:00 2021-12-30 125.0 Gemessen \n",
"3 Unbekannt 2021-01-01T03:00:00 2021-12-30 84.0 Gemessen \n",
"4 Unbekannt 2021-01-01T04:00:00 2021-12-30 49.0 Gemessen \n",
"Data for year 2022:\n",
" MSID MSName ZSID ZSName Achse \\\n",
"0 Z001M001 Unbekannt Z001 Seestrasse (Strandbad Wollishofen) Seestrasse \n",
"1 Z001M001 Unbekannt Z001 Seestrasse (Strandbad Wollishofen) Seestrasse \n",
"2 Z001M001 Unbekannt Z001 Seestrasse (Strandbad Wollishofen) Seestrasse \n",
"3 Z001M001 Unbekannt Z001 Seestrasse (Strandbad Wollishofen) Seestrasse \n",
"4 Z001M001 Unbekannt Z001 Seestrasse (Strandbad Wollishofen) Seestrasse \n",
"\n",
" HNr Hoehe EKoord NKoord Richtung Knummer \\\n",
"0 451 Unbekannt 2683009.89 1243936.2 auswärts 789 \n",
"1 451 Unbekannt 2683009.89 1243936.2 auswärts 789 \n",
"2 451 Unbekannt 2683009.89 1243936.2 auswärts 789 \n",
"3 451 Unbekannt 2683009.89 1243936.2 auswärts 789 \n",
"4 451 Unbekannt 2683009.89 1243936.2 auswärts 789 \n",
"\n",
" Kname AnzDetektoren D1ID D2ID D3ID \\\n",
"0 Badanstalt Wollishofen 1 2 Unbekannt Unbekannt \n",
"1 Badanstalt Wollishofen 1 2 Unbekannt Unbekannt \n",
"2 Badanstalt Wollishofen 1 2 Unbekannt Unbekannt \n",
"3 Badanstalt Wollishofen 1 2 Unbekannt Unbekannt \n",
"4 Badanstalt Wollishofen 1 2 Unbekannt Unbekannt \n",
"\n",
" D4ID MessungDatZeit LieferDat AnzFahrzeuge AnzFahrzeugeStatus \n",
"0 Unbekannt 2022-01-01T00:00:00 2022-12-30 166.0 Gemessen \n",
"1 Unbekannt 2022-01-01T01:00:00 2022-12-30 255.0 Gemessen \n",
"2 Unbekannt 2022-01-01T02:00:00 2022-12-30 168.0 Gemessen \n",
"3 Unbekannt 2022-01-01T03:00:00 2022-12-30 96.0 Gemessen \n",
"4 Unbekannt 2022-01-01T04:00:00 2022-12-30 63.0 Gemessen \n"
]
}
],
@ -219,7 +232,8 @@
"\n",
"tables_dict = {}\n",
"pd.set_option(\"display.max_columns\", None)\n",
"data_dir = 'data/'\n",
"data_dir = 'datasets/'\n",
"\n",
"for year in range(2012, 2023):\n",
" file_name = f'sid_dav_verkehrszaehlung_miv_OD2031_{year}.csv'\n",
" file_path = os.path.join(data_dir, file_name)\n",
@ -701,7 +715,7 @@
"else:\n",
" print(f\"Failed to download data. Status code: {response.status_code}\")\n",
"\n",
"accidents_file_path = os.path.join(data_dir, ')\n"
"accidents_file_path = os.path.join(data_dir)\n"
]
}
],

View File

@ -1,29 +1,14 @@
# Databases Project
# Database Project Group 1
Use this repository for your integration code and any source code created while working on your project (ad-hoc code,
SQL queries, project files needed by external tools, etc.).
## Preliminaries
* Ensure you have access to a running Postgres instance.
* Ensure you have ```python3``` and ```pip``` installed.
* From within the root of the project, run ```pip install -r requirements.txt```. This ensures all Python dependencies are met.
* In ```src/fill_db.py```, look for the ```db_info``` variable and adapt it to your credentials.
- Merge your code into the main branch on the due date.
- Do not commit datasets!
- Any other document (except for the dump in the final hand-in) should be handed-in via ADAM.
If you have any questions regarding the project, please do not hesitate to ask during the exercise lessons or via mail
to [raphael.waltenspuel@unibas.ch](mailto:raphael.waltenspuel@unibas.ch)!
It is recommended that you first create a ```.gitignore``` file. (And exclude the "datasets" folder, for example). A useful tool for creating ```.gitignore``` files is www.gitignore.io.
Feel free to update or replace this readme with a brief description of your project and goals.
### Database setup guide
1. Make sure all the requirements in ```requirements.txt``` are met. If they are not met, run ```pip install -r requirements.txt``` in the root of the project.
2. Run the Python script ```integrate.py``` in the ```src``` folder. In the ```main``` method of the script, set a boolean to ```False``` if the corresponding dataset still needs to be downloaded; if the datasets have already been downloaded, set all the booleans to ```True```. The datasets need to be in a folder named ```datasets``` inside ```src``` (this should be set up automatically by the script).
3. Ensure you have a running Postgres instance with a database.
4. Ensure you have the correct credentials in the ```db_info``` variable of the Python script ```fill_db.py```.
5. Run ```fill_db.py```.
## Action
In the following, the order matters.
1. Run ```ensure_dirs_exist.py```. This makes sure all the directories needed to perform the data integration and logging exist.
2. Run ```integrate.py```. Adjust the main method to fit your needs. In particular, adjust the call to ```process_all_data_sources()``` so that the parameter corresponding to a dataset is ```False``` if the script should download it from the internet, and ```True``` otherwise. To get the GeoJSON data for signaled speeds in the city of Zurich, uncomment the line in the ```main``` method that calls ```load_tempo_geojson_from_api_to_local()```.
3. Run ```fill_db.py```. This will load the data into the database based on the credentials given in the ```db_info``` variable (see the sketch below).
4. Perform the analysis.
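
For reference, ```fill_db.py``` reads its connection settings from the ```db_info``` dictionary near the top of ```src/fill_db.py```. The sketch below shows the shape of that dictionary with placeholder values; adapt them to your own Postgres instance:

```python
# src/fill_db.py -- adapt these values to your own Postgres instance
db_info = {
    'host': 'localhost',       # host of the running Postgres instance
    'database': 'test-db23',   # name of the target database
    'port': '5432',
    'user': 'seb',             # your database user
    'password': '',            # your password (empty if none is set)
}
```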

View File

@ -0,0 +1 @@
https://data.stadt-zuerich.ch/dataset/sid_dav_strassenverkehrsunfallorte/download/RoadTrafficAccidentLocations.json

22
docs/all_csv_urls.txt Normal file
View File

@ -0,0 +1,22 @@
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2012.csv
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2013.csv
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2014.csv
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2015.csv
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2016.csv
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2017.csv
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2018.csv
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2019.csv
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2020.csv
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2021.csv
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2022.csv
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2012_verkehrszaehlungen_werte_fussgaenger_velo.csv
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2013_verkehrszaehlungen_werte_fussgaenger_velo.csv
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2014_verkehrszaehlungen_werte_fussgaenger_velo.csv
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2015_verkehrszaehlungen_werte_fussgaenger_velo.csv
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2016_verkehrszaehlungen_werte_fussgaenger_velo.csv
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2017_verkehrszaehlungen_werte_fussgaenger_velo.csv
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2018_verkehrszaehlungen_werte_fussgaenger_velo.csv
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2019_verkehrszaehlungen_werte_fussgaenger_velo.csv
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2020_verkehrszaehlungen_werte_fussgaenger_velo.csv
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2021_verkehrszaehlungen_werte_fussgaenger_velo.csv
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2022_verkehrszaehlungen_werte_fussgaenger_velo.csv

View File

@ -0,0 +1,20 @@
# TODOs
* Write a script that makes tables and inserts the data.
* Find out if data cleaning can be done in Python with pandas or if it all must be SQL scripts.
# Project Diary
| Version<br/> 0.00 | Author: <br />michel.romancuk@stud.unibas.ch<br />sebastian.lenzlinger@unibas.ch<br /> | HS 2023<br />Databases<br /> |
|-----------------------|-------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------|
| Date | | Problems |
| October / pre 16.11.23 | Decision to use a Postgres server. <br/> Server setup.<br/> Set up pgadmin at [pgadmin.slenzlinger.dev](pgadmin.slenzlinger.dev) | |
| 16.11.23 | Set up the repo and wrote some instructions into ``wiki.md`` on how to set up the environment. | Realized that the steps for how postgres, pgadmin, nginx etc. were set up at the time had not been written down. |
| | | |
| | | |
| | | |
| | | |
| | | |
| | | |
| | | |
| | | |

View File

@ -0,0 +1,11 @@
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2012_verkehrszaehlungen_werte_fussgaenger_velo.csv
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2013_verkehrszaehlungen_werte_fussgaenger_velo.csv
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2014_verkehrszaehlungen_werte_fussgaenger_velo.csv
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2015_verkehrszaehlungen_werte_fussgaenger_velo.csv
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2016_verkehrszaehlungen_werte_fussgaenger_velo.csv
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2017_verkehrszaehlungen_werte_fussgaenger_velo.csv
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2018_verkehrszaehlungen_werte_fussgaenger_velo.csv
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2019_verkehrszaehlungen_werte_fussgaenger_velo.csv
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2020_verkehrszaehlungen_werte_fussgaenger_velo.csv
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2021_verkehrszaehlungen_werte_fussgaenger_velo.csv
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2022_verkehrszaehlungen_werte_fussgaenger_velo.csv

View File

@ -0,0 +1,11 @@
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2012.csv
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2013.csv
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2014.csv
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2015.csv
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2016.csv
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2017.csv
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2018.csv
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2019.csv
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2020.csv
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2021.csv
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2022.csv

View File

@ -0,0 +1,25 @@
# Setup Of Virtual Python dev env
First, open a terminal and make sure you are in the root directory.
All steps assume you are in the project root folder.
## Create Virtual environment
```
python3 -m venv db23-project-venv
```
## Activating the virtual environment
```
source db23-project-venv/bin/activate
```
#### When in the environment ``db23-project-venv``, just install all needed packages.
```
pip3 install pkg_name
```
## Getting back out
```
deactivate
```
# List of used packages
See ``requirements.txt``
# Setting up postgres
# Setting up pgadmin as a container served by nginx

112
requirements.txt Normal file
View File

@ -0,0 +1,112 @@
anyio==4.0.0
appnope==0.1.3
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
asttokens==2.4.1
async-lru==2.0.4
attrs==23.1.0
Babel==2.13.1
beautifulsoup4==4.12.2
black==23.11.0
bleach==6.1.0
certifi==2023.7.22
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
click-plugins==1.1.1
cligj==0.7.2
comm==0.2.0
debugpy==1.8.0
decorator==5.1.1
defusedxml==0.7.1
executing==2.0.1
fastjsonschema==2.19.0
fiona==1.9.5
fqdn==1.5.1
GeoAlchemy2==0.14.2
geopandas==0.14.1
idna==3.4
ipykernel==6.26.0
ipython==8.17.2
ipywidgets==8.1.1
isoduration==20.11.0
jedi==0.19.1
Jinja2==3.1.2
json5==0.9.14
jsonpointer==2.4
jsonschema==4.19.2
jsonschema-specifications==2023.11.1
jupyter==1.0.0
jupyter-console==6.6.3
jupyter-events==0.9.0
jupyter-lsp==2.2.0
jupyter_client==8.6.0
jupyter_core==5.5.0
jupyter_server==2.10.1
jupyter_server_terminals==0.4.4
jupyterlab==4.0.8
jupyterlab-pygments==0.2.2
jupyterlab-widgets==3.0.9
jupyterlab_server==2.25.1
MarkupSafe==2.1.3
matplotlib-inline==0.1.6
mistune==3.0.2
mypy-extensions==1.0.0
nbclient==0.9.0
nbconvert==7.11.0
nbformat==5.9.2
nest-asyncio==1.5.8
notebook==7.0.6
notebook_shim==0.2.3
numpy==1.26.2
overrides==7.4.0
packaging==23.2
pandas==2.1.3
pandocfilters==1.5.0
parso==0.8.3
pathspec==0.11.2
pexpect==4.8.0
platformdirs==4.0.0
prometheus-client==0.18.0
prompt-toolkit==3.0.41
psutil==5.9.6
psycopg2==2.9.9
ptyprocess==0.7.0
pure-eval==0.2.2
pycparser==2.21
Pygments==2.16.1
pyproj==3.6.1
python-dateutil==2.8.2
python-json-logger==2.0.7
pytz==2023.3.post1
PyYAML==6.0.1
pyzmq==25.1.1
qtconsole==5.5.1
QtPy==2.4.1
referencing==0.31.0
requests==2.31.0
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rpds-py==0.13.0
Send2Trash==1.8.2
shapely==2.0.2
six==1.16.0
sniffio==1.3.0
soupsieve==2.5
SQLAlchemy==2.0.23
stack-data==0.6.3
terminado==0.18.0
tinycss2==1.2.1
tornado==6.3.3
traitlets==5.13.0
types-python-dateutil==2.8.19.14
typing_extensions==4.8.0
tzdata==2023.3
uri-template==1.3.0
urllib3==2.1.0
wcwidth==0.2.10
webcolors==1.13
webencodings==0.5.1
websocket-client==1.6.4
widgetsnbextension==4.0.9

139
src/data_utils.py Normal file
View File

@ -0,0 +1,139 @@
import json
import os
import pandas as pd
import requests
from urllib.parse import urlparse
import geopandas as gpd
from concurrent.futures import ThreadPoolExecutor as tpe
import logging
logging.basicConfig(level=logging.DEBUG, filename='logs/data_utils.log', format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('data_utils.py')
stream_handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)
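# Stream the file at `url` to `local_filename` in 8 KiB chunks so large CSVs are never held in memory at once.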
def download_csv(url, local_filename):
with requests.get(url, stream=True) as r:
r.raise_for_status()
with open(local_filename, 'wb') as f:
for chunk in r.iter_content(chunk_size=8192):
f.write(chunk)
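# Download every URL listed in `urls_file` into `data_dir`, creating the directory if needed and skipping files that already exist locally.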
def process_urls(data_dir, urls_file):
# Ensure the data directory exists
if not os.path.exists(data_dir):
os.makedirs(data_dir)
# Read URLs from the file
with open(urls_file, 'r') as file:
urls = file.readlines()
# Process each URL
for url in urls:
url = url.strip()
filename = os.path.basename(urlparse(url).path)
local_filename = os.path.join(data_dir, filename)
# Check if the file already exists
if not os.path.isfile(local_filename):
logger.debug(f"Downloading {url}...")
download_csv(url, local_filename)
logger.debug(f"Saved to {local_filename}")
else:
print(f"File {filename} already exists in {data_dir}, skipping download.")
def load_dataframe_from_csv(filepath):
try:
df = pd.read_csv(filepath, low_memory=False)
return df
except Exception as e:
logger.error(f"Error loading {filepath}: {e}")
return None
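# Load every CSV in `data_dir` whose name contains `u_string`, drop duplicate rows per file, and return the resulting DataFrames as a list.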
def load_dataframes_from_csv_files(data_dir, u_string):
dataframes = []
# with tpe(max_workers=5) as executor:
# for filename in os.listdir(data_dir):
# if (u_string in filename) and filename.endswith('.csv'):
# filepath = os.path.join(data_dir, filename)
# future = executor.submit(load_dataframe_from_csv, filepath)
# dataframes.append(future)
#
# dataframes = [future.result() for future in dataframes if future.result() is not None]
#
# return dataframes
for filename in os.listdir(data_dir):
if (u_string in filename) and filename.endswith('.csv'):
filepath = os.path.join(data_dir, filename)
df = pd.read_csv(filepath, low_memory=False)
logger.debug(f'Duplicate Rows for {filename}: {df[df.duplicated()].shape[0]}')
df = df.drop_duplicates()
logger.debug(f'Duplicate Rows after DROPPING for {filename}: {df[df.duplicated()].shape[0]}')
dataframes.append(df.drop_duplicates())
return dataframes
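# Load the GeoJSON (.json) file in `data_dir` whose name contains `u_string` into a GeoDataFrame; if several files match, the last one read wins.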
def load_dataframes_from_geojson_files(data_dir, u_string):
print('u_string', u_string)
gdf = gpd.GeoDataFrame()
for filename in os.listdir(data_dir):
#print("Filename:", filename)
if (u_string in filename) and filename.endswith('.json'):
filepath = os.path.join(data_dir, filename)
print("Filepath:", filepath)
gdf = gpd.read_file(filepath)
return gdf
def combine_dataframes(dataframes):
if dataframes:
combined_dataframe = pd.concat(dataframes, ignore_index=True)
logger.debug(f'Duplicate Rows after combining: {combined_dataframe[combined_dataframe.duplicated()]}')
return combined_dataframe
else:
print("No dataframes to combine")
return pd.DataFrame()
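# Download the CSVs listed in `urls_file` unless `files_present` is True, then load all files matching `u_string` and concatenate them into a single DataFrame.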
def create_unified_df(urls_file, u_string, data_dir, files_present=False):
df_list = []
df_unified = None
if not files_present:
process_urls(data_dir, urls_file)
df_list = load_dataframes_from_csv_files(data_dir, u_string)
df_unified = combine_dataframes(df_list)
return df_unified
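# GET the JSON payload from `api_link` and save it under `target_name` in `integrated_dir`; a `.geojson` suffix is appended to the target name.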
def load_file_from_api(api_link, target_name, integrated_dir):
response = requests.get(api_link)
final_location = os.path.join(integrated_dir, target_name)
if response.status_code == 200:
logger.info(f"Succesfull get from {api_link}")
data = response.json()
with open(f'{final_location}.geojson', 'w') as file:
json.dump(data, file)
logger.info(f"{api_link} successfully downloaded and saved to {final_location}")
else:
logger.critical(f"Failed to get data. Status Code: {response.status_code}")
def save_dataframe_to_csv(df, integrated_dir, filename):
pass
if __name__ == "__main__":
csv_urls_file = '../docs/all_csv_urls.txt'
datasets_dir = 'datasets/'
output_file = 'column_names.txt'
process_urls(datasets_dir, csv_urls_file)
# extract_column_names(datasets_dir, output_file)

26
src/ensure_dirs_exist.py Normal file
View File

@ -0,0 +1,26 @@
import logging
import os
"""
The functionality of this script has been adapted from data_utils.ensure_dirs_exist().
This needs to be run before any other script.
"""
data_dir = 'datasets/'
integrated_dir = 'datasets/integrated/'
logs_dir = 'logs/'
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('ensure_dirs_exist.py')
stream_handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)
logger.debug(f'data_dir: {data_dir}\n integrated_dir: {integrated_dir}')
logger.info("Ensuring needed directories exist.")
os.makedirs(data_dir, exist_ok=True)
logger.debug("data_dir created.")
os.makedirs(integrated_dir, exist_ok=True)
logger.debug("integrated_dir created")
os.makedirs(logs_dir, exist_ok=True)
logger.debug("logs_dir created")

108
src/fill_db.py Normal file
View File

@ -0,0 +1,108 @@
import logging
import psycopg2
import subprocess
logging.basicConfig(level=logging.DEBUG, filename='logs/fill_db.log',
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('fill_db.py')
stream_handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)
integrated_dir = 'datasets/integrated/'
accident_geojson_file = 'datasets/integrated/Accidents.geojson'
signaled_speeds_file = 'datasets/integrated/signaled_speeds.geojson.geojson'
accident_loader_script = 'load_accidents_into_db.sh'
accident_table_name = 'accidents'
signaled_speeds_table_name = 'signaled_speeds'
"""
Make sure db_info contains the correct credentials.
"""
db_info = {
'host': 'localhost',
'database': 'test-db23',
'port': '5432',
'user': 'seb',
'password': '',
}
setup_tables_script = 'setup_tables.sql'
load_csvs_into_db_script = 'load_csvs_into_db.sql'
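# Execute the SQL file `script` against the database described by `db_info`, committing on success and rolling back on failure.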
def run_sql(script, db_info):
db_connection = psycopg2.connect(**db_info)
db_cursor = db_connection.cursor()
with open(script, 'r') as sql_file:
sql_script = sql_file.read()
try:
db_cursor.execute(sql_script)
db_connection.commit()
logger.info(f'{script} executed successfully')
except Exception as e:
db_connection.rollback()
logger.exception(f'Error executing {script}: {e}')
finally:
db_cursor.close()
db_connection.close()
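# Run the bash helper script (an ogr2ogr wrapper) with the given positional arguments to load a GeoJSON file into the database.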
def run_geojson_loader_script(script, *args):
try:
cmd = ['bash', script] + list(args)
res = subprocess.run(cmd, check=True, text=True, capture_output=True)
logger.info(f'{script} executed successfully. Output: {res.stdout}')
except subprocess.CalledProcessError as e:
logger.exception(f'Error executing {script}: {e}')
logger.info(f"Remember to set the correct permissions for the script: chmod +x {script}")
def geojson_loader(*args, modus='append'):
"""
Use this instead of run_geojson_loader_script() in the main method to avoid the bash dependency.
:param args: All the arguments needed for ogr2ogr to run properly
:param modus: append or overwrite db table
:return:
"""
geojson_file, db_name, db_user, db_password, db_host, db_port, target_table = args
cmd = [
"ogr2ogr",
"-f", "PostgreSQL",
f"PG:dbname='{db_name}' host='{db_host}' port='{db_port}' user='{db_user}' password='{db_password}'",
geojson_file,
"-nln", target_table,
f"-{modus}"
]
try:
# Run the command
res = subprocess.run(cmd, check=True, text=True, capture_output=True)
logger.info(f"ogr2ogr command executed successfully. Output: {res.stdout}")
except subprocess.CalledProcessError as e:
logger.exception(f"Error executing ogr2ogr command: {e}")
if __name__ == '__main__':
run_sql(setup_tables_script, db_info)
logger.info("Finnished setting up tables.")
run_sql(load_csvs_into_db_script, db_info)
logger.info("Finnished loading csv into db.")
run_geojson_loader_script(accident_loader_script,
accident_geojson_file,
db_info['database'],
db_info['user'],
db_info['password'],
db_info['host'],
db_info['port'],
accident_table_name)
logger.info('Finished loading accident geojson into db using bash script.')
geojson_loader(signaled_speeds_file,
db_info['database'],
db_info['user'],
db_info['password'],
db_info['host'],
db_info['port'],
signaled_speeds_table_name,
modus='overwrite')

245
src/integrate.py Normal file
View File

@ -0,0 +1,245 @@
import data_utils as du
import os
import pandas as pd
import geopandas as gpd
import time
from shapely.geometry import Point
import re
import logging
logging.basicConfig(level=logging.DEBUG, filename='logs/integrate.log',
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('integrate.py')
stream_handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)
foot_bike_urls_file = '../docs/foot_bike_zaehlung_urls.txt'
miv_file_urls = '../docs/verkehrszaehlung_moto_urls.txt'
accident_file_url = '../docs/accident_loc_urls.txt'
# Using u_string to discriminate between files that belong together
motor_file_u_string = 'sid_dav_verkehrszaehlung_miv_OD2031'
foot_bike_file_u_string = 'velo.csv'
accident_file_u_string = 'RoadTrafficAccidentLocations.json'
data_dir = 'datasets/'
integrated_dir = 'datasets/integrated/'
logs_dir = 'logs/'
signaled_speeds_json_api = 'https://www.ogd.stadt-zuerich.ch/wfs/geoportal/Signalisierte_Geschwindigkeiten?service=WFS&version=1.1.0&request=GetFeature&outputFormat=GeoJSON&typename=view_geoserver_tempo_ist'
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
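# Column dtypes for the cleaned DataFrames; they are applied via .astype() before export so the DataFrames match the table definitions in setup_tables.sql.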
fb_data_types = {
'ID': 'int',
'NORD': 'int',
'OST': 'int',
'DATE': 'str',
'HRS': 'int',
'VELO_IN': 'int',
'VELO_OUT': 'int',
'FUSS_IN': 'int',
'FUSS_OUT': 'int',
'Weekday_en': 'str'
}
miv_data_types = {
'ID': 'int',
'MSID': 'str',
'ZSID': 'str',
'Achse': 'str',
'NKoord': 'int',
'EKoord': 'int',
'Richtung': 'str',
'AnzFahrzeuge': 'int',
'AnzFahrzeugeStatus': 'str',
'Datum': 'str',
'Hrs': 'int',
'Weekday_en': 'str',
}
acc_data_types = {
'AccidentUID': 'str',
'AccidentYear': 'int',
'AccidentMonth': 'int',
'AccidentWeekDay_en': 'str',
'AccidentHour': 'int',
'NKoord': 'int',
'EKoord': 'int',
'AccidentType_en': 'str',
'AccidentType': 'str',
'AccidentSeverityCategory': 'str',
'AccidentInvolvingPedestrian': 'bool',
'AccidentInvolvingBicycle': 'bool',
'AccidentInvolvingMotorcycle': 'bool',
'RoadType': 'str',
'RoadType_en': 'str',
'geometry': 'str' # TODO: Figure out what dtype this needs to be for postgres
}
# def ensure_dirs_exist(data_dir, integrated_dir, logs_dir):
# """
# This should be called before anything else to make sure that the relevant directories exists.
# :param data_dir: directory where the datasets are stored
# :param integrated_dir: directory where the integrated data will be stored
# :return:
# """
# logger.debug(f'data_dir: {data_dir}\n integrated_dir: {integrated_dir}')
# logger.info("Ensuring needed directories exist.")
# os.makedirs(data_dir, exist_ok=True)
# logger.debug("data_dir created.")
# os.makedirs(integrated_dir, exist_ok=True)
# logger.debug("integrated_dir created")
# os.makedirs(logs_dir, exist_ok=True)
# logger.debug("logs_dir created")
def process_foot_bike_data(files_present=True):
fb_df_unified = du.create_unified_df(foot_bike_urls_file, foot_bike_file_u_string, data_dir,
files_present=files_present)
fb_df_unified[['DATE', "TIME"]] = fb_df_unified['DATUM'].str.split('T', expand=True)
fb_df_unified[['HRS', 'MINS']] = fb_df_unified['TIME'].str.split(':', expand=True)
# We might still need FK_ZAEHLER after all
fb_cols_to_drop = ['DATUM']
fb_df_unified_correct_cols = fb_df_unified.drop(columns=fb_cols_to_drop, axis=1)
fb_df_unified_correct_cols.fillna(0, inplace=True)
fb_df_grouped = fb_df_unified_correct_cols.groupby(['OST', 'NORD', 'DATE', 'HRS']).agg({
'VELO_IN': 'sum',
'VELO_OUT': 'sum',
'FUSS_IN': 'sum',
'FUSS_OUT': 'sum'
}).reset_index()
dt_obj = pd.to_datetime(fb_df_grouped['DATE'])
days = dt_obj.dt.weekday
fb_df_grouped['Weekday_en'] = days.map(lambda x: weekday_names[x])
cleaned_fb_df = fb_df_grouped
cleaned_fb_df['ID'] = cleaned_fb_df.index + 1
cleaned_fb_df = cleaned_fb_df[['ID', 'NORD', 'OST', 'DATE', 'HRS', 'VELO_IN', 'VELO_OUT', 'FUSS_IN',
'FUSS_OUT', 'Weekday_en']]
# Ensure datatype of df and sql table match
cleaned_fb_df = cleaned_fb_df.astype(fb_data_types)
return cleaned_fb_df
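# Clean the motorized-traffic counts: split the timestamp, keep the relevant columns, fill missing values, add the English weekday name and a running ID, and drop duplicate rows.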
def process_miv_data(files_present=True):
miv_df_unified = du.create_unified_df(miv_file_urls, motor_file_u_string, data_dir, files_present=files_present)
miv_df_unified[['Datum', "Time"]] = miv_df_unified['MessungDatZeit'].str.split('T', expand=True)
miv_df_unified[['Hrs', 'Mins', 'Sec']] = miv_df_unified['Time'].str.split(':', expand=True)
miv_cols_to_keep = ['MSID','ZSID','Achse', 'NKoord', 'EKoord', 'Richtung', 'AnzFahrzeuge', 'AnzFahrzeugeStatus',
'Datum', 'Hrs',]
miv_df_cols_dropped = miv_df_unified[miv_cols_to_keep]
dt_obj = pd.to_datetime(miv_df_cols_dropped['Datum'])
days = dt_obj.dt.weekday
miv_df_cols_dropped.loc[:, 'Weekday_en'] = days.map(lambda x: weekday_names[x])
miv_df_cols_dropped.loc[:, 'AnzFahrzeuge'] = miv_df_cols_dropped['AnzFahrzeuge'].fillna(0).astype(int)
miv_df_cols_dropped.loc[:, 'ZSID'] = miv_df_cols_dropped['ZSID'].fillna('Missing').astype(str)
miv_df_cols_dropped['ID'] = (miv_df_cols_dropped.index + 1).copy()
cleaned_miv_df = miv_df_cols_dropped[['ID', 'MSID', 'ZSID', 'Achse', 'NKoord', 'EKoord', 'Richtung', 'AnzFahrzeuge',
'AnzFahrzeugeStatus', 'Datum', 'Hrs', 'Weekday_en']]
cleaned_miv_df = cleaned_miv_df.astype(miv_data_types)
cleaned_miv_df = cleaned_miv_df.drop_duplicates()
return cleaned_miv_df
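# Clean the accident locations loaded from GeoJSON: keep the relevant attributes and rename the CHLV95 coordinate columns to NKoord/EKoord.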
def process_accident_data(file_present: bool = True):
if not file_present:
du.process_urls(data_dir, accident_file_url)
acc_df_unified = du.load_dataframes_from_geojson_files(data_dir, accident_file_u_string)
acc_cols_to_keep = ['AccidentUID', 'AccidentYear', 'AccidentMonth', 'AccidentWeekDay_en','AccidentHour',
'AccidentLocation_CHLV95_N', 'AccidentLocation_CHLV95_E', 'AccidentType_en', 'AccidentType',
'AccidentSeverityCategory', 'AccidentInvolvingPedestrian', 'AccidentInvolvingBicycle',
'AccidentInvolvingMotorcycle', 'RoadType', 'RoadType_en',
'geometry']
cleaned_acc_df = acc_df_unified[acc_cols_to_keep]
cleaned_acc_df.rename(columns={
'AccidentLocation_CHLV95_E': 'EKoord',
'AccidentLocation_CHLV95_N': 'NKoord',
}, inplace=True)
cleaned_acc_df = cleaned_acc_df.astype(acc_data_types)
return cleaned_acc_df
def process_all_data_sources(fb_present=True, miv_present=True, accident_present=True):
"""
Process all data sources and turn them into CSV/GeoJSON files. After this function is called, there
should be files of the cleaned and integrated data sources in the integrated directory.
:param fb_present: bool, whether the foot/bike files are present in the local file system
:param miv_present: bool, whether the motorized-traffic files are present in the local file system
:param accident_present: bool, whether the accident file is present in the local file system
:return:
"""
# ensure_dirs_exist(data_dir, integrated_dir)
logger.info("Started processing all data sources.")
fb_to_integrated(fb_present)
miv_to_integrated_csv(miv_present)
acc_to_cleaned_geojson(accident_present)
def fb_to_integrated(files_present=True):
start_time = time.time()
logger.info("Start processing pedestrian and bicycle data (FootBikeCount)")
fb_count_df = process_foot_bike_data(files_present)
logger.debug(f'FB Head:{fb_count_df.head()}\n FB dtypes: {fb_count_df.dtypes}')
fb_file_path = os.path.join(integrated_dir, 'FootBikeCount.csv')
logger.debug(f'FB Cleaned File Path: {fb_file_path}')
fb_count_df.to_csv(fb_file_path, index=False)
logger.info("FB integrated csv created.")
end_time = time.time()
logger.info(f'Time taken for FootBikeCount: {end_time-start_time}')
def miv_to_integrated_csv(miv_present=True):
start_time2 = time.time()
logger.info("Start processing motorized vehicle data (MivCount)")
miv_count_df = process_miv_data(miv_present)
logger.debug(f'MIV Head:{miv_count_df.head()}\n MIV dtypes: {miv_count_df.dtypes}')
miv_file_path = os.path.join(integrated_dir, 'MivCount.csv')
logger.debug(f'MIV Cleaned File Path: {miv_file_path}')
miv_count_df.to_csv(miv_file_path, index=False)
logger.info("MIV integrated csv created.")
end_time = time.time()
logger.info(f'Time taken for MivCount: {end_time-start_time2}')
def acc_to_cleaned_geojson(acc_present=True):
start_time3 = time.time()
logger.info("Start processing accident data (Accidents)")
acc_df = process_accident_data(acc_present)
logger.debug(f'ACC Head: { acc_df.head()}\n Acc dtypes: {acc_df.dtypes}')
acc_file_path = os.path.join(integrated_dir, 'Accidents.geojson')
logger.debug(f'Acc Cleaned file path: {acc_file_path}')
acc_df['geometry'] = acc_df['geometry'].apply(lambda row: re.findall(r"[-+]?\d*\.\d+|\d+", row))
# Create a Point object using the extracted coordinates
acc_df['geometry'] = acc_df['geometry'].apply(
lambda coords: Point(float(coords[0]), float(coords[1]), float(coords[2])))
acc_gdf = gpd.GeoDataFrame(acc_df, geometry='geometry')
acc_gdf.to_file(acc_file_path, driver='GeoJSON')
logger.info("ACC integrated csv created.")
end_time = time.time()
logger.info(f'Time taken for Accidents: {end_time - start_time3}')
def load_tempo_geojson_from_api_to_local():
du.load_file_from_api(signaled_speeds_json_api, 'signaled_speeds.geojson', integrated_dir)
if __name__ == '__main__':
# ensure_dirs_exist(data_dir, integrated_dir, logs_dir)
# process_all_data_sources(True, True, True)
# miv_to_integrated_csv()
# acc_to_cleaned_geojson()
load_tempo_geojson_from_api_to_local()

View File

@ -0,0 +1,15 @@
#!/bin/bash
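# Usage: ./load_accidents_into_db.sh GEOJSON_FILE DB_NAME DB_USER DB_PASSWORD DB_HOST DB_PORT TARGET_TABLE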
# Define parameters
GEOJSON_FILE=$1
DB_NAME=$2
DB_USER=$3
DB_PASSWORD=$4
DB_HOST=$5
DB_PORT=$6
TARGET_TABLE=$7
# Run ogr2ogr command
ogr2ogr -f "PostgreSQL" PG:"dbname='$DB_NAME' host='$DB_HOST' port='$DB_PORT' user='$DB_USER' password='$DB_PASSWORD'" "$GEOJSON_FILE" -nln $TARGET_TABLE -append
echo "GeoJSON data has been imported into $TARGET_TABLE"

View File

@ -0,0 +1,7 @@
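-- COPY reads from the file system of the database server, so the absolute paths below must point to the integrated CSV files on that machine; adjust them to your setup.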
COPY FootBikeCount FROM '/Users/seb/Projects/repos/group-1/src/datasets/integrated/FootBikeCount.csv'
DELIMITER ','
CSV HEADER;
COPY MivCount FROM '/Users/seb/Projects/repos/group-1/src/datasets/integrated/MivCount.csv'
DELIMITER ','
CSV HEADER;

42
src/queries.sql Normal file
View File

@ -0,0 +1,42 @@
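-- Ad-hoc check: match foot/bike counts, accidents and motorized-traffic counts that share the same weekday and hour, where the foot/bike and motor counting stations lie within 100 m of each other on both coordinate axes (CHLV95).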
select p.id, a.accidentuid, m.id
from footbikecount p, accidents a, mivcount m
where p.weekday_en = a.accidentweekday_en AND a.accidentweekday_en = m.weekday_en
AND p.weekday_en = m.weekday_en AND p.hrs = a.accidenthour AND a.accidenthour = m.hrs
AND p.hrs = m.hrs AND (p.ost - m.ekoord between -100 AND 100) AND (p.nord - m.nkoord between -100 AND 100);
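-- Materialize the same join as a table, additionally storing the Euclidean distance between the foot/bike and motor counting stations.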
DROP TABLE IF EXISTS Contemporaneous2;
CREATE TABLE Contemporaneous2 (
p_id INTEGER,
accidentuid VARCHAR(256),
m_id INTEGER,
weekday_en VARCHAR(10),
hrs INTEGER,
distance DOUBLE PRECISION
);
CREATE TABLE Intermediate2 AS
SELECT
p.id AS p_id,
a.accidentuid,
m.id AS m_id,
p.weekday_en,
p.hrs,
SQRT(POWER(p.ost - m.ekoord, 2) + POWER(p.nord - m.nkoord, 2)) AS distance
FROM
footbikecount p,
accidents a,
mivcount m
WHERE
p.weekday_en = a.accidentweekday_en
AND a.accidentweekday_en = m.weekday_en
AND p.weekday_en = m.weekday_en
AND p.hrs = a.accidenthour
AND a.accidenthour = m.hrs
AND p.hrs = m.hrs
AND (p.ost - m.ekoord BETWEEN -100 AND 100)
AND (p.nord - m.nkoord BETWEEN -100 AND 100);
INSERT INTO Contemporaneous2 (p_id, accidentuid, m_id, weekday_en, hrs, distance)
SELECT p_id, accidentuid, m_id, weekday_en, hrs, distance FROM Intermediate2;

72
src/setup_tables.sql Normal file
View File

@ -0,0 +1,72 @@
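-- PostGIS is required for the geometry(Point, 4326) column of the Accidents table.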
CREATE EXTENSION IF NOT EXISTS postgis;
DROP TABLE IF EXISTS FootBikeCount;
DROP TABLE IF EXISTS Accidents;
DROP TABLE IF EXISTS MivCount;
CREATE TABLE FootBikeCount (
ID INTEGER ,
NORD INTEGER ,
OST INT ,
DATE VARCHAR(10) ,
HRS INTEGER ,
VELO_IN INTEGER ,
VELO_OUT INTEGER ,
FUSS_IN INTEGER ,
FUSS_OUT INTEGER ,
Weekday_en VARCHAR(10) ,
PRIMARY KEY(ID) ,
CHECK (Weekday_en IN ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')),
CHECK (Hrs BETWEEN 0 AND 23)
);
CREATE TABLE MivCount (
ID INTEGER ,
MSID VARCHAR(10) ,
ZSID VARCHAR(10) ,
Achse VARCHAR(256) ,
NKoord INTEGER ,
EKoord INTEGER ,
Richtung VARCHAR(100) ,
AnzFahrzeuge INTEGER ,
AnzFahrzeugeStatus VARCHAR(20) ,
Datum VARCHAR(10) ,
Hrs Integer ,
Weekday_en VARCHAR(10),
PRIMARY KEY (ID),
CHECK (Weekday_en IN ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')),
CHECK (Hrs BETWEEN 0 AND 23)
);
CREATE TABLE Accidents (
AccidentUID VARCHAR(256) ,
AccidentYear INTEGER ,
AccidentMonth INTEGER,
AccidentWeekDay_en VARCHAR(10) ,
AccidentHour INTEGER ,
NKoord INTEGER ,
EKoord INTEGER ,
AccidentType_en VARCHAR(256) ,
AccidentType VARCHAR(4) ,
AccidentSeverityCategory VARCHAR(4) ,
AccidentInvolvingPedestrian BOOLEAN ,
AccidentInvolvingBicycle BOOLEAN ,
AccidentInvolvingMotorcycle BOOLEAN ,
RoadType VARCHAR(5) ,
RoadType_en VARCHAR(256) ,
Geometry geometry(Point, 4326) ,
PRIMARY KEY (AccidentUID) ,
CHECK ( AccidentHour BETWEEN 0 AND 23) ,
CHECK (AccidentWeekDay_en IN ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'))
);

233
src/testArea.ipynb Normal file
View File

@ -0,0 +1,233 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"outputs": [],
"source": [
"import pandas as pd\n",
"from datetime import datetime as dt\n",
"\n",
"import integrate as intr\n",
"\n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-12-03T15:47:10.110909Z",
"start_time": "2023-12-03T15:47:09.656556Z"
}
},
"id": "be55b25929d95559"
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/seb/Projects/repos/group-1/src/integrate.py:132: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" miv_df_cols_dropped['Weekday_en'] = days.map(lambda x: weekday_names[x])\n",
"/Users/seb/Projects/repos/group-1/src/integrate.py:133: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" miv_df_cols_dropped['AnzFahrzeuge'] = miv_df_cols_dropped['AnzFahrzeuge'].fillna(0).astype(int)\n"
]
}
],
"source": [
"\n",
"miv_df = intr.process_miv_data()\n",
"#fb_data = intr.process_foot_bike_data()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-12-03T15:49:07.561603Z",
"start_time": "2023-12-03T15:47:14.759104Z"
}
},
"id": "dd3831953afdeb72"
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"121\n"
]
}
],
"source": [
"duplicate_rows = miv_df[miv_df.duplicated()]\n",
"print(duplicate_rows.shape[0])"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-12-03T15:51:21.158909Z",
"start_time": "2023-12-03T15:51:15.711222Z"
}
},
"id": "14471cd78389ce4d"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"acc_df = intr.process_accident_data(True)"
],
"metadata": {
"collapsed": false
},
"id": "f86bc612060b17a4"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"acc_df.head()\n",
"acc_df['AccidentWeekDay_en'].unique()\n",
"#acc_df.dtypes\n",
"\n"
],
"metadata": {
"collapsed": false
},
"id": "6affbeea6c7cf3ef"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"print(\"Accident Columns:\")\n",
"print(acc_df.dtypes)\n",
"print()\n",
"print(\"MIV Columns:\")\n",
"print(miv_df.dtypes)\n",
"print()\n",
"print(\"FB Cols:\")\n",
"print(fb_data.dtypes)"
],
"metadata": {
"collapsed": false
},
"id": "242041cd369d8454"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"acc_df['ID'] = acc_df.index +1\n",
"acc_df[('ID')]"
],
"metadata": {
"collapsed": false
},
"id": "1841925ee109a417"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"print(\"MIV unqiue:\", miv_df['EKoord'])\n",
"print(\"Acc unique:\", acc_df['RoadType'].unique)\n",
"print(\"FB unique: \", fb_data['DATE'])\n"
],
"metadata": {
"collapsed": false
},
"id": "f6d752ea17eda341"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"acc_df.head()"
],
"metadata": {
"collapsed": false
},
"id": "a159cafa9c227b88"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"from sqlalchemy import create_engine\n",
"from geoalchemy2 import Geometry, WKTElement\n",
"import geopandas as gpd\n",
"from shapely import wkt\n",
"\n",
"db_url = f'postgresql://seb:@localhost:5432/test-db23'\n",
"engine = create_engine(db_url)\n",
"\n",
"#miv_df.to_sql('table_name', engine, if_exists='replace', index=False)\n",
"#fb_data.to_sql('footbike', engine, if_exists='replace', index=False)\n",
"\n",
"geometry_column = 'geometry'\n",
"\n",
"\n",
"acc_df['geometry'] = acc_df['geometry'].apply(lambda geom: WKTElement(geom.wkt, srid=4326))\n",
"\n",
"acc_df.to_sql('accidents', engine, if_exists='replace', index=False, dtype={'geometry': Geometry('POINT', srid=4326)})\n",
"\n"
],
"metadata": {
"collapsed": false
},
"id": "fa76af8343443d7a"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"engine.dispose()"
],
"metadata": {
"collapsed": false
},
"id": "bc0a23a5126e76c2"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}