Merge branch 'dev-explore' into 'main'
Milestone Merge

See merge request dbis/lecture-groups/database-systems/2023hs/group-1!3
This commit is contained in:
commit 8be3279ace

1 .gitignore vendored
@@ -1,6 +1,7 @@
# Created by https://www.toptal.com/developers/gitignore/api/python,pycharm,linux,macos,database,data
# Edit at https://www.toptal.com/developers/gitignore?templates=python,pycharm,linux,macos,database,data
datasets/
db23-project-venv/
### Data ###
*.csv
*.dat
@@ -2,14 +2,18 @@
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"execution_count": 1,
|
||||
"id": "17ca2acb",
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
},
|
||||
"tags": []
|
||||
"tags": [],
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-11-16T16:20:49.426349Z",
|
||||
"start_time": "2023-11-16T16:20:16.117316Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -156,7 +160,11 @@
|
||||
"2 Unbekannt 2016-01-01T02:00:00 2021-02-03 213.0 Gemessen \n",
|
||||
"3 Unbekannt 2016-01-01T03:00:00 2021-02-03 112.0 Gemessen \n",
|
||||
"4 Unbekannt 2016-01-01T04:00:00 2021-02-03 80.0 Gemessen \n",
|
||||
"Data for year 2017:\n",
|
||||
"File not found for year 2017: datasets/sid_dav_verkehrszaehlung_miv_OD2031_2017.csv\n",
|
||||
"File not found for year 2018: datasets/sid_dav_verkehrszaehlung_miv_OD2031_2018.csv\n",
|
||||
"File not found for year 2019: datasets/sid_dav_verkehrszaehlung_miv_OD2031_2019.csv\n",
|
||||
"File not found for year 2020: datasets/sid_dav_verkehrszaehlung_miv_OD2031_2020.csv\n",
|
||||
"Data for year 2021:\n",
|
||||
" MSID MSName ZSID ZSName Achse \\\n",
|
||||
"0 Z001M001 Unbekannt Z001 Seestrasse (Strandbad Wollishofen) Seestrasse \n",
|
||||
"1 Z001M001 Unbekannt Z001 Seestrasse (Strandbad Wollishofen) Seestrasse \n",
|
||||
@@ -179,34 +187,39 @@
|
||||
"4 Badanstalt Wollishofen 1 2 Unbekannt Unbekannt \n",
|
||||
"\n",
|
||||
" D4ID MessungDatZeit LieferDat AnzFahrzeuge AnzFahrzeugeStatus \n",
|
||||
"0 Unbekannt 2017-01-01T00:00:00 2021-02-03 295.0 Gemessen \n",
|
||||
"1 Unbekannt 2017-01-01T01:00:00 2021-02-03 264.0 Gemessen \n",
|
||||
"2 Unbekannt 2017-01-01T02:00:00 2021-02-03 180.0 Gemessen \n",
|
||||
"3 Unbekannt 2017-01-01T03:00:00 2021-02-03 107.0 Gemessen \n",
|
||||
"4 Unbekannt 2017-01-01T04:00:00 2021-02-03 97.0 Gemessen \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "KeyboardInterrupt",
|
||||
"evalue": "",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[0;32m/tmp/ipykernel_311061/2135127822.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlow_memory\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0mtable_key\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34mf\"miv_{year}\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/anaconda3/lib/python3.9/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 946\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkwds_defaults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 947\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 948\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 949\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 950\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/anaconda3/lib/python3.9/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 615\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 616\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 617\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 618\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 619\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/anaconda3/lib/python3.9/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 1746\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1747\u001b[0m \u001b[0mcol_dict\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1748\u001b[0;31m \u001b[0;34m)\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m \u001b[0;31m# type: ignore[attr-defined]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1749\u001b[0m \u001b[0mnrows\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1750\u001b[0m )\n",
|
||||
"\u001b[0;32m~/anaconda3/lib/python3.9/site-packages/pandas/io/parsers/c_parser_wrapper.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 237\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 238\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 239\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 240\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 241\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_first_chunk\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32mparsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.read\u001b[0;34m()\u001b[0m\n",
|
||||
"\u001b[0;32mparsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._read_rows\u001b[0;34m()\u001b[0m\n",
|
||||
"\u001b[0;32mparsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_column_data\u001b[0;34m()\u001b[0m\n",
|
||||
"\u001b[0;32mparsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers._maybe_upcast\u001b[0;34m()\u001b[0m\n",
|
||||
"\u001b[0;32m~/anaconda3/lib/python3.9/site-packages/numpy/core/multiarray.py\u001b[0m in \u001b[0;36mputmask\u001b[0;34m(a, mask, values)\u001b[0m\n\u001b[1;32m 1129\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1130\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1131\u001b[0;31m \u001b[0;34m@\u001b[0m\u001b[0marray_function_from_c_func_and_dispatcher\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_multiarray_umath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mputmask\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1132\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mputmask\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m/\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1133\u001b[0m \"\"\"\n",
|
||||
"\u001b[0;32m_pydevd_bundle/pydevd_cython.pyx\u001b[0m in \u001b[0;36m_pydevd_bundle.pydevd_cython.SafeCallWrapper.__call__\u001b[0;34m()\u001b[0m\n",
|
||||
"\u001b[0;32m_pydevd_bundle/pydevd_cython.pyx\u001b[0m in \u001b[0;36m_pydevd_bundle.pydevd_cython.ThreadTracer.__call__\u001b[0;34m()\u001b[0m\n",
|
||||
"\u001b[0;32m~/anaconda3/lib/python3.9/site-packages/debugpy/_vendored/pydevd/_pydev_bundle/pydev_is_thread_alive.py\u001b[0m in \u001b[0;36mis_thread_alive\u001b[0;34m(t)\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_temp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'_is_stopped'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# Python 3.x has this\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0mis_thread_alive\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_is_stopped\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
|
||||
"0 Unbekannt 2021-01-01T00:00:00 2021-12-30 122.0 Gemessen \n",
|
||||
"1 Unbekannt 2021-01-01T01:00:00 2021-12-30 177.0 Gemessen \n",
|
||||
"2 Unbekannt 2021-01-01T02:00:00 2021-12-30 125.0 Gemessen \n",
|
||||
"3 Unbekannt 2021-01-01T03:00:00 2021-12-30 84.0 Gemessen \n",
|
||||
"4 Unbekannt 2021-01-01T04:00:00 2021-12-30 49.0 Gemessen \n",
|
||||
"Data for year 2022:\n",
|
||||
" MSID MSName ZSID ZSName Achse \\\n",
|
||||
"0 Z001M001 Unbekannt Z001 Seestrasse (Strandbad Wollishofen) Seestrasse \n",
|
||||
"1 Z001M001 Unbekannt Z001 Seestrasse (Strandbad Wollishofen) Seestrasse \n",
|
||||
"2 Z001M001 Unbekannt Z001 Seestrasse (Strandbad Wollishofen) Seestrasse \n",
|
||||
"3 Z001M001 Unbekannt Z001 Seestrasse (Strandbad Wollishofen) Seestrasse \n",
|
||||
"4 Z001M001 Unbekannt Z001 Seestrasse (Strandbad Wollishofen) Seestrasse \n",
|
||||
"\n",
|
||||
" HNr Hoehe EKoord NKoord Richtung Knummer \\\n",
|
||||
"0 451 Unbekannt 2683009.89 1243936.2 auswärts 789 \n",
|
||||
"1 451 Unbekannt 2683009.89 1243936.2 auswärts 789 \n",
|
||||
"2 451 Unbekannt 2683009.89 1243936.2 auswärts 789 \n",
|
||||
"3 451 Unbekannt 2683009.89 1243936.2 auswärts 789 \n",
|
||||
"4 451 Unbekannt 2683009.89 1243936.2 auswärts 789 \n",
|
||||
"\n",
|
||||
" Kname AnzDetektoren D1ID D2ID D3ID \\\n",
|
||||
"0 Badanstalt Wollishofen 1 2 Unbekannt Unbekannt \n",
|
||||
"1 Badanstalt Wollishofen 1 2 Unbekannt Unbekannt \n",
|
||||
"2 Badanstalt Wollishofen 1 2 Unbekannt Unbekannt \n",
|
||||
"3 Badanstalt Wollishofen 1 2 Unbekannt Unbekannt \n",
|
||||
"4 Badanstalt Wollishofen 1 2 Unbekannt Unbekannt \n",
|
||||
"\n",
|
||||
" D4ID MessungDatZeit LieferDat AnzFahrzeuge AnzFahrzeugeStatus \n",
|
||||
"0 Unbekannt 2022-01-01T00:00:00 2022-12-30 166.0 Gemessen \n",
|
||||
"1 Unbekannt 2022-01-01T01:00:00 2022-12-30 255.0 Gemessen \n",
|
||||
"2 Unbekannt 2022-01-01T02:00:00 2022-12-30 168.0 Gemessen \n",
|
||||
"3 Unbekannt 2022-01-01T03:00:00 2022-12-30 96.0 Gemessen \n",
|
||||
"4 Unbekannt 2022-01-01T04:00:00 2022-12-30 63.0 Gemessen \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -219,7 +232,8 @@
|
||||
"\n",
|
||||
"tables_dict = {}\n",
|
||||
"pd.set_option(\"display.max_columns\", None)\n",
|
||||
"data_dir = 'data/'\n",
|
||||
"data_dir = 'datasets/'\n",
|
||||
"\n",
|
||||
"for year in range(2012, 2023):\n",
|
||||
" file_name = f'sid_dav_verkehrszaehlung_miv_OD2031_{year}.csv'\n",
|
||||
" file_path = os.path.join(data_dir, file_name)\n",
|
||||
@@ -701,7 +715,7 @@
|
||||
"else:\n",
|
||||
" print(f\"Failed to download data. Status code: {response.status_code}\")\n",
|
||||
"\n",
|
||||
"accidents_file_path = os.path.join(data_dir, ')\n"
|
||||
"accidents_file_path = os.path.join(data_dir)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
||||
39 README.md
@@ -1,29 +1,14 @@
# Databases Project
# Database Project Group 1

Use this repository for your integration code and any source code created while working on your project (ad-hoc code, SQL queries, project files needed by external tools, etc.).
## Preliminaries
* Ensure you have access to a running postgres instance.
* Ensure you have ```python3``` and ```pip``` installed.
* From within the root of the project, run ```pip install -r requirements.txt```. This ensures all Python dependencies are met.
* In ```src/fill_db.py```, look for the ```db_info``` variable and adapt it to your credentials.

- Merge your code into the main branch on the due date.
- Do not commit datasets!
- Any other document (except for the dump in the final hand-in) should be handed in via ADAM.

If you have any questions regarding the project, please do not hesitate to ask during the exercise lessons or via mail to [raphael.waltenspuel@unibas.ch](mailto:raphael.waltenspuel@unibas.ch)!

It is recommended that you first create a ```.gitignore``` file (and exclude the "datasets" folder, for example). A useful tool for creating ```.gitignore``` files is www.gitignore.io.

Feel free to update or replace this readme with a brief description of your project and goals.

### Database setup guide

1. Make sure all the requirements in ```requirements.txt``` are met. If they are not met, run ```pip install -r requirements.txt``` in the root of the project.
2. Run the Python script ```integrate.py``` in the ```src``` folder. Set all booleans to ```False``` in the main method of the script. If the datasets have already been downloaded, set all the booleans to ```True```. The datasets need to be in a folder named ```datasets``` in ```src``` (this should be set up automatically by the script).
3. Ensure you have a running Postgres instance with a database.
4. Ensure you have the correct credentials in the Python script ```fill_db.py``` in ```db_info```.
5. Run ```fill_db.py```.
## Action
In the following, the order matters. A combined example is sketched after this list.
1. Run ```ensure_dirs_exist.py```. This makes sure all the directories needed to perform the data integration and logging exist.
2. Run ```integrate.py```. Adjust the main method to fit your needs. In particular, adjust the ```process_all_data_sources()``` call so that the parameter corresponding to a dataset is ```False``` if the script should download it from the internet, and ```True``` otherwise. To get the GeoJSON data for signalled speeds in the city of Zurich, uncomment the line in the ``main`` method where you find ```load_tempo_geojson_from_api_to_local()```.
3. Run ```fill_db.py```. This will load the data into the database based on the credentials given in the ``db_info`` variable.
4. Perform analysis.
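For example, assuming the commands are run from inside ```src``` (the scripts use relative paths such as ```datasets/``` and ```../docs/```), one possible session is:

```
cd src
python3 ensure_dirs_exist.py   # create datasets/, datasets/integrated/ and logs/
python3 integrate.py           # download, clean and integrate the raw data
python3 fill_db.py             # create tables and load the integrated data into Postgres
```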
1 docs/accident_loc_urls.txt Normal file
@@ -0,0 +1 @@
https://data.stadt-zuerich.ch/dataset/sid_dav_strassenverkehrsunfallorte/download/RoadTrafficAccidentLocations.json
22 docs/all_csv_urls.txt Normal file
@@ -0,0 +1,22 @@
|
||||
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2012.csv
|
||||
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2013.csv
|
||||
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2014.csv
|
||||
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2015.csv
|
||||
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2016.csv
|
||||
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2017.csv
|
||||
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2018.csv
|
||||
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2019.csv
|
||||
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2020.csv
|
||||
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2021.csv
|
||||
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2022.csv
|
||||
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2012_verkehrszaehlungen_werte_fussgaenger_velo.csv
|
||||
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2013_verkehrszaehlungen_werte_fussgaenger_velo.csv
|
||||
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2014_verkehrszaehlungen_werte_fussgaenger_velo.csv
|
||||
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2015_verkehrszaehlungen_werte_fussgaenger_velo.csv
|
||||
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2016_verkehrszaehlungen_werte_fussgaenger_velo.csv
|
||||
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2017_verkehrszaehlungen_werte_fussgaenger_velo.csv
|
||||
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2018_verkehrszaehlungen_werte_fussgaenger_velo.csv
|
||||
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2019_verkehrszaehlungen_werte_fussgaenger_velo.csv
|
||||
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2020_verkehrszaehlungen_werte_fussgaenger_velo.csv
|
||||
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2021_verkehrszaehlungen_werte_fussgaenger_velo.csv
|
||||
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2022_verkehrszaehlungen_werte_fussgaenger_velo.csv
|
||||
@@ -0,0 +1,20 @@
# TODOs
* Write a script that creates the tables and inserts the data.
* Find out whether data cleaning can be done in Python with pandas or whether it all has to be done in SQL scripts.

# Project Diary

| Version<br/> 0.00 | Author: <br />michel.romancuk@stud.unibas.ch<br />sebastian.lenzlinger@unibas.ch<br /> | HS 2023<br />Databases<br /> |
|-----------------------|----------------------------------------------------------------------------------------|------------------------------|
| Date                  |                                                                                          | Problems                     |
| October / before 16.11.23 | Decision to use a postgres server. <br/> Server setup.<br/> Set up pgadmin at [pgadmin.slenzlinger.dev](pgadmin.slenzlinger.dev) | |
| 16.11.23 | Set up the repo and wrote some instructions into ``wiki.md`` on how to set up the environment. | Realized that the steps for how postgres, pgadmin, nginx etc. were set up were not written down at the time. |
11 docs/foot_bike_zaehlung_urls.txt Normal file
@@ -0,0 +1,11 @@
|
||||
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2012_verkehrszaehlungen_werte_fussgaenger_velo.csv
|
||||
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2013_verkehrszaehlungen_werte_fussgaenger_velo.csv
|
||||
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2014_verkehrszaehlungen_werte_fussgaenger_velo.csv
|
||||
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2015_verkehrszaehlungen_werte_fussgaenger_velo.csv
|
||||
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2016_verkehrszaehlungen_werte_fussgaenger_velo.csv
|
||||
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2017_verkehrszaehlungen_werte_fussgaenger_velo.csv
|
||||
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2018_verkehrszaehlungen_werte_fussgaenger_velo.csv
|
||||
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2019_verkehrszaehlungen_werte_fussgaenger_velo.csv
|
||||
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2020_verkehrszaehlungen_werte_fussgaenger_velo.csv
|
||||
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2021_verkehrszaehlungen_werte_fussgaenger_velo.csv
|
||||
https://data.stadt-zuerich.ch/dataset/ted_taz_verkehrszaehlungen_werte_fussgaenger_velo/download/2022_verkehrszaehlungen_werte_fussgaenger_velo.csv
|
||||
11 docs/verkehrszaehlung_moto_urls.txt Normal file
@@ -0,0 +1,11 @@
|
||||
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2012.csv
|
||||
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2013.csv
|
||||
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2014.csv
|
||||
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2015.csv
|
||||
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2016.csv
|
||||
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2017.csv
|
||||
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2018.csv
|
||||
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2019.csv
|
||||
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2020.csv
|
||||
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2021.csv
|
||||
https://data.stadt-zuerich.ch/dataset/sid_dav_verkehrszaehlung_miv_od2031/download/sid_dav_verkehrszaehlung_miv_OD2031_2022.csv
|
||||
25 docs/wiki.md
@@ -0,0 +1,25 @@
# Setup of the virtual Python dev env
First open the terminal and make sure you are in the root directory.
All steps assume you are in the root folder.
## Creating the virtual environment
```
python3 -m venv db23-project-venv
```
## Activating the virtual environment
```
source db23-project-venv/bin/activate
```
#### When in the environment ``db23-project-venv``, just install any needed packages.
```
pip3 install pkg_name
```
## Getting back out
```
deactivate
```

# List of used packages
See ``requirements.txt``

# Setting up postgres
# Setting up pgadmin as a container served by nginx
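Putting it together, a full session from the repository root might look like this (installing the pinned packages from ``requirements.txt`` instead of individual ones):

```
python3 -m venv db23-project-venv        # create the virtual environment
source db23-project-venv/bin/activate    # activate it
pip3 install -r requirements.txt         # install all pinned dependencies
deactivate                               # leave the environment when done
```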
112 requirements.txt Normal file
@@ -0,0 +1,112 @@
|
||||
anyio==4.0.0
|
||||
appnope==0.1.3
|
||||
argon2-cffi==23.1.0
|
||||
argon2-cffi-bindings==21.2.0
|
||||
arrow==1.3.0
|
||||
asttokens==2.4.1
|
||||
async-lru==2.0.4
|
||||
attrs==23.1.0
|
||||
Babel==2.13.1
|
||||
beautifulsoup4==4.12.2
|
||||
black==23.11.0
|
||||
bleach==6.1.0
|
||||
certifi==2023.7.22
|
||||
cffi==1.16.0
|
||||
charset-normalizer==3.3.2
|
||||
click==8.1.7
|
||||
click-plugins==1.1.1
|
||||
cligj==0.7.2
|
||||
comm==0.2.0
|
||||
debugpy==1.8.0
|
||||
decorator==5.1.1
|
||||
defusedxml==0.7.1
|
||||
executing==2.0.1
|
||||
fastjsonschema==2.19.0
|
||||
fiona==1.9.5
|
||||
fqdn==1.5.1
|
||||
GeoAlchemy2==0.14.2
|
||||
geopandas==0.14.1
|
||||
idna==3.4
|
||||
ipykernel==6.26.0
|
||||
ipython==8.17.2
|
||||
ipywidgets==8.1.1
|
||||
isoduration==20.11.0
|
||||
jedi==0.19.1
|
||||
Jinja2==3.1.2
|
||||
json5==0.9.14
|
||||
jsonpointer==2.4
|
||||
jsonschema==4.19.2
|
||||
jsonschema-specifications==2023.11.1
|
||||
jupyter==1.0.0
|
||||
jupyter-console==6.6.3
|
||||
jupyter-events==0.9.0
|
||||
jupyter-lsp==2.2.0
|
||||
jupyter_client==8.6.0
|
||||
jupyter_core==5.5.0
|
||||
jupyter_server==2.10.1
|
||||
jupyter_server_terminals==0.4.4
|
||||
jupyterlab==4.0.8
|
||||
jupyterlab-pygments==0.2.2
|
||||
jupyterlab-widgets==3.0.9
|
||||
jupyterlab_server==2.25.1
|
||||
MarkupSafe==2.1.3
|
||||
matplotlib-inline==0.1.6
|
||||
mistune==3.0.2
|
||||
mypy-extensions==1.0.0
|
||||
nbclient==0.9.0
|
||||
nbconvert==7.11.0
|
||||
nbformat==5.9.2
|
||||
nest-asyncio==1.5.8
|
||||
notebook==7.0.6
|
||||
notebook_shim==0.2.3
|
||||
numpy==1.26.2
|
||||
overrides==7.4.0
|
||||
packaging==23.2
|
||||
pandas==2.1.3
|
||||
pandocfilters==1.5.0
|
||||
parso==0.8.3
|
||||
pathspec==0.11.2
|
||||
pexpect==4.8.0
|
||||
platformdirs==4.0.0
|
||||
prometheus-client==0.18.0
|
||||
prompt-toolkit==3.0.41
|
||||
psutil==5.9.6
|
||||
psycopg2==2.9.9
|
||||
ptyprocess==0.7.0
|
||||
pure-eval==0.2.2
|
||||
pycparser==2.21
|
||||
Pygments==2.16.1
|
||||
pyproj==3.6.1
|
||||
python-dateutil==2.8.2
|
||||
python-json-logger==2.0.7
|
||||
pytz==2023.3.post1
|
||||
PyYAML==6.0.1
|
||||
pyzmq==25.1.1
|
||||
qtconsole==5.5.1
|
||||
QtPy==2.4.1
|
||||
referencing==0.31.0
|
||||
requests==2.31.0
|
||||
rfc3339-validator==0.1.4
|
||||
rfc3986-validator==0.1.1
|
||||
rpds-py==0.13.0
|
||||
Send2Trash==1.8.2
|
||||
shapely==2.0.2
|
||||
six==1.16.0
|
||||
sniffio==1.3.0
|
||||
soupsieve==2.5
|
||||
SQLAlchemy==2.0.23
|
||||
stack-data==0.6.3
|
||||
terminado==0.18.0
|
||||
tinycss2==1.2.1
|
||||
tornado==6.3.3
|
||||
traitlets==5.13.0
|
||||
types-python-dateutil==2.8.19.14
|
||||
typing_extensions==4.8.0
|
||||
tzdata==2023.3
|
||||
uri-template==1.3.0
|
||||
urllib3==2.1.0
|
||||
wcwidth==0.2.10
|
||||
webcolors==1.13
|
||||
webencodings==0.5.1
|
||||
websocket-client==1.6.4
|
||||
widgetsnbextension==4.0.9
|
||||
139 src/data_utils.py Normal file
@@ -0,0 +1,139 @@
|
||||
import json
|
||||
import os
|
||||
import pandas as pd
|
||||
import requests
|
||||
from urllib.parse import urlparse
|
||||
import geopandas as gpd
|
||||
from concurrent.futures import ThreadPoolExecutor as tpe
|
||||
import logging
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG, filename='logs/data_utils.log', format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger('data_utils.py')
|
||||
stream_handler = logging.StreamHandler()
|
||||
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
stream_handler.setFormatter(formatter)
|
||||
logger.addHandler(stream_handler)
|
||||
|
||||
|
||||
def download_csv(url, local_filename):
|
||||
with requests.get(url, stream=True) as r:
|
||||
r.raise_for_status()
|
||||
with open(local_filename, 'wb') as f:
|
||||
for chunk in r.iter_content(chunk_size=8192):
|
||||
f.write(chunk)
|
||||
|
||||
|
||||
def process_urls(data_dir, urls_file):
|
||||
# Ensure the data directory exists
|
||||
if not os.path.exists(data_dir):
|
||||
os.makedirs(data_dir)
|
||||
|
||||
# Read URLs from the file
|
||||
with open(urls_file, 'r') as file:
|
||||
urls = file.readlines()
|
||||
|
||||
# Process each URL
|
||||
for url in urls:
|
||||
url = url.strip()
|
||||
filename = os.path.basename(urlparse(url).path)
|
||||
local_filename = os.path.join(data_dir, filename)
|
||||
|
||||
# Check if the file already exists
|
||||
if not os.path.isfile(local_filename):
|
||||
logger.debug(f"Downloading {url}...")
|
||||
download_csv(url, local_filename)
|
||||
logger.debug(f"Saved to {local_filename}")
|
||||
else:
|
||||
print(f"File {filename} already exists in {data_dir}, skipping download.")
|
||||
|
||||
|
||||
def load_dataframe_from_csv(filepath):
|
||||
try:
|
||||
df = pd.read_csv(filepath, low_memory=False)
|
||||
return df
|
||||
except Exception as e:
|
||||
logger.error(f"Error loading {filepath}: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def load_dataframes_from_csv_files(data_dir, u_string):
|
||||
dataframes = []
|
||||
|
||||
# with tpe(max_workers=5) as executor:
|
||||
# for filename in os.listdir(data_dir):
|
||||
# if (u_string in filename) and filename.endswith('.csv'):
|
||||
# filepath = os.path.join(data_dir, filename)
|
||||
# future = executor.submit(load_dataframe_from_csv, filepath)
|
||||
# dataframes.append(future)
|
||||
#
|
||||
# dataframes = [future.result() for future in dataframes if future.result() is not None]
|
||||
#
|
||||
# return dataframes
|
||||
|
||||
for filename in os.listdir(data_dir):
|
||||
if (u_string in filename) and filename.endswith('.csv'):
|
||||
filepath = os.path.join(data_dir, filename)
|
||||
df = pd.read_csv(filepath, low_memory=False)
|
||||
logger.debug(f'Duplicate Rows for {filename}: {df[df.duplicated()].shape[0]}')
|
||||
df = df.drop_duplicates()
|
||||
logger.debug(f'Duplicate Rows after DROPPING for {filename}: {df[df.duplicated()].shape[0]}')
|
||||
dataframes.append(df.drop_duplicates())
|
||||
return dataframes
|
||||
|
||||
|
||||
def load_dataframes_from_geojson_files(data_dir, u_string):
|
||||
print('u_string', u_string)
|
||||
gdf = gpd.GeoDataFrame()
|
||||
for filename in os.listdir(data_dir):
|
||||
#print("Filename:", filename)
|
||||
if (u_string in filename) and filename.endswith('.json'):
|
||||
filepath = os.path.join(data_dir, filename)
|
||||
print("Filepath:", filepath)
|
||||
gdf = gpd.read_file(filepath)
|
||||
|
||||
return gdf
|
||||
|
||||
|
||||
def combine_dataframes(dataframes):
|
||||
if dataframes:
|
||||
combined_dataframe = pd.concat(dataframes, ignore_index=True)
|
||||
logger.debug(f'Duplicate Rows after combining: {combined_dataframe[combined_dataframe.duplicated()]}')
|
||||
return combined_dataframe
|
||||
else:
|
||||
print("No dataframes to combine")
|
||||
return pd.DataFrame()
|
||||
|
||||
|
||||
def create_unified_df(urls_file, u_string, data_dir, files_present=False):
|
||||
df_list = []
|
||||
df_unified = None
|
||||
if not files_present:
|
||||
process_urls(data_dir, urls_file)
|
||||
|
||||
df_list = load_dataframes_from_csv_files(data_dir, u_string)
|
||||
df_unified = combine_dataframes(df_list)
|
||||
|
||||
return df_unified
|
||||
|
||||
|
||||
def load_file_from_api(api_link, target_name, integrated_dir):
|
||||
response = requests.get(api_link)
|
||||
final_location = os.path.join(integrated_dir, target_name)
|
||||
if response.status_code == 200:
|
||||
logger.info(f"Succesfull get from {api_link}")
|
||||
data = response.json()
|
||||
with open(f'{final_location}.geojson', 'w') as file:
|
||||
json.dump(data, file)
|
||||
logger.info(f"{api_link} successfully downloaded and saved to {final_location}")
|
||||
else:
|
||||
logger.critical(f"Failed to get data. Status Code: {response.status_code}")
|
||||
def save_dataframe_to_csv(df, integrated_dir, filename):
|
||||
pass
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
csv_urls_file = '../docs/all_csv_urls.txt'
|
||||
datasets_dir = 'datasets/'
|
||||
output_file = 'column_names.txt'
|
||||
process_urls(datasets_dir, csv_urls_file)
|
||||
# extract_column_names(datasets_dir, output_file)
|
||||
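As a usage sketch (not part of the module), the helpers above can be chained as follows. The URL file and ``u_string`` are the ones ``integrate.py`` uses for the motorised-traffic counts, and the snippet assumes it runs from ``src/`` with ``logs/`` already created (e.g. by ``ensure_dirs_exist.py``) so the log file path resolves:

```
import data_utils as du

# Download any CSVs from the URL list that are not yet present, then load them,
# drop duplicates and concatenate everything into a single DataFrame.
miv_df = du.create_unified_df(
    urls_file='../docs/verkehrszaehlung_moto_urls.txt',
    u_string='sid_dav_verkehrszaehlung_miv_OD2031',
    data_dir='datasets/',
    files_present=False,
)
print(miv_df.shape)
```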
26 src/ensure_dirs_exist.py Normal file
@@ -0,0 +1,26 @@
import logging
import os
"""
The functionality of this script has been adapted from data_utils.ensure_dirs_exist().
This needs to be run before any other script.
"""
data_dir = 'datasets/'
integrated_dir = 'datasets/integrated/'
logs_dir = 'logs/'

logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('ensure_dirs_exist.py')
stream_handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)

logger.debug(f'data_dir: {data_dir}\n integrated_dir: {integrated_dir}')
logger.info("Ensuring needed directories exist.")
os.makedirs(data_dir, exist_ok=True)
logger.debug("data_dir created.")
os.makedirs(integrated_dir, exist_ok=True)
logger.debug("integrated_dir created")
os.makedirs(logs_dir, exist_ok=True)
logger.debug("logs_dir created")
108 src/fill_db.py Normal file
@@ -0,0 +1,108 @@
|
||||
import logging
|
||||
import psycopg2
|
||||
import subprocess
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG, filename='logs/fill_db.log',
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger('fill_db.py')
|
||||
stream_handler = logging.StreamHandler()
|
||||
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
stream_handler.setFormatter(formatter)
|
||||
logger.addHandler(stream_handler)
|
||||
|
||||
integrated_dir = 'datasets/integrated/'
|
||||
accident_geojson_file = 'datasets/integrated/Accidents.geojson'
|
||||
signaled_speeds_file = 'datasets/integrated/signaled_speeds.geojson.geojson'
|
||||
accident_loader_script = 'load_accidents_into_db.sh'
|
||||
accident_table_name = 'accidents'
|
||||
signaled_speeds_table_name = 'signaled_speeds'
|
||||
|
||||
"""
|
||||
Make sure db_info contains the correct credentials.
|
||||
"""
|
||||
db_info = {
|
||||
'host': 'localhost',
|
||||
'database': 'test-db23',
|
||||
'port': '5432',
|
||||
'user': 'seb',
|
||||
'password': '',
|
||||
}
|
||||
setup_tables_script = 'setup_tables.sql'
|
||||
load_csvs_into_db_script = 'load_csvs_into_db.sql'
|
||||
|
||||
|
||||
def run_sql(script, db_info):
|
||||
db_connection = psycopg2.connect(**db_info)
|
||||
db_cursor = db_connection.cursor()
|
||||
|
||||
with open(script, 'r') as sql_file:
|
||||
sql_script = sql_file.read()
|
||||
|
||||
try:
|
||||
db_cursor.execute(sql_script)
|
||||
db_connection.commit()
|
||||
logger.info(f'{script} executed successfully')
|
||||
except Exception as e:
|
||||
db_connection.rollback()
|
||||
logger.exception(f'Error executing {script}: {e}')
|
||||
finally:
|
||||
db_cursor.close()
|
||||
db_connection.close()
|
||||
|
||||
|
||||
def run_geojson_loader_script(script, *args):
|
||||
try:
|
||||
cmd = ['bash', script] + list(args)
|
||||
res = subprocess.run(cmd, check=True, text=True, capture_output=True)
|
||||
logger.info(f'{script} executed successfully. Output: {res.stdout}')
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.exception(f'Error executing {script}: {e}')
|
||||
logger.info(f"Remember to set the correct permissions for the script: chmod +x {script}")
|
||||
|
||||
|
||||
def geojson_loader(*args, modus='append'):
|
||||
"""
|
||||
Use this instead of run_geojson_loader_script() in the main method to avoid the bash dependency.
|
||||
:param args: All the arguments needed for ogr2ogr to run properly
|
||||
:param modus: append or overwrite db table
|
||||
:return:
|
||||
"""
|
||||
geojson_file, db_name, db_user, db_password, db_host, db_port, target_table = args
|
||||
cmd = [
|
||||
"ogr2ogr",
|
||||
"-f", "PostgreSQL",
|
||||
f"PG:dbname='{db_name}' host='{db_host}' port='{db_port}' user='{db_user}' password='{db_password}'",
|
||||
geojson_file,
|
||||
"-nln", target_table,
|
||||
f"-{modus}"
|
||||
]
|
||||
try:
|
||||
# Run the command
|
||||
res = subprocess.run(cmd, check=True, text=True, capture_output=True)
|
||||
logger.info(f"ogr2ogr command executed successfully. Output: {res.stdout}")
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.exception(f"Error executing ogr2ogr command: {e}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
run_sql(setup_tables_script, db_info)
|
||||
logger.info("Finnished setting up tables.")
|
||||
run_sql(load_csvs_into_db_script, db_info)
|
||||
logger.info("Finnished loading csv into db.")
|
||||
run_geojson_loader_script(accident_loader_script,
|
||||
accident_geojson_file,
|
||||
db_info['database'],
|
||||
db_info['user'],
|
||||
db_info['password'],
|
||||
db_info['host'],
|
||||
db_info['port'],
|
||||
accident_table_name)
|
||||
logger.info('Finished loading accident geojson into db using bash script.')
|
||||
geojson_loader(signaled_speeds_file,
|
||||
db_info['database'],
|
||||
db_info['user'],
|
||||
db_info['password'],
|
||||
db_info['host'],
|
||||
db_info['port'],
|
||||
signaled_speeds_table_name,
|
||||
modus='overwrite')
|
||||
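A quick, hypothetical sanity check after ``fill_db.py`` has finished, reusing the same ``db_info`` connection values (adapt host, database, user and password to your setup):

```
import psycopg2

# Connect with the credentials from db_info and count the loaded rows.
conn = psycopg2.connect(host='localhost', database='test-db23', port='5432', user='seb', password='')
cur = conn.cursor()
cur.execute("SELECT count(*) FROM MivCount;")
print("MivCount rows:", cur.fetchone()[0])
cur.close()
conn.close()
```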
245 src/integrate.py Normal file
@@ -0,0 +1,245 @@
|
||||
import data_utils as du
|
||||
import os
|
||||
import pandas as pd
|
||||
import geopandas as gpd
|
||||
import time
|
||||
from shapely.geometry import Point
|
||||
import re
|
||||
|
||||
import logging
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG, filename='logs/integrate.log',
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger('integrate.py')
|
||||
stream_handler = logging.StreamHandler()
|
||||
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
stream_handler.setFormatter(formatter)
|
||||
logger.addHandler(stream_handler)
|
||||
|
||||
foot_bike_urls_file = '../docs/foot_bike_zaehlung_urls.txt'
|
||||
miv_file_urls = '../docs/verkehrszaehlung_moto_urls.txt'
|
||||
accident_file_url = '../docs/accident_loc_urls.txt'
|
||||
|
||||
# Using u_string to discriminate between files that belong to each other
|
||||
motor_file_u_string = 'sid_dav_verkehrszaehlung_miv_OD2031'
|
||||
foot_bike_file_u_string = 'velo.csv'
|
||||
accident_file_u_string = 'RoadTrafficAccidentLocations.json'
|
||||
|
||||
data_dir = 'datasets/'
|
||||
integrated_dir = 'datasets/integrated/'
|
||||
logs_dir = 'logs/'
|
||||
|
||||
signaled_speeds_json_api = 'https://www.ogd.stadt-zuerich.ch/wfs/geoportal/Signalisierte_Geschwindigkeiten?service=WFS&version=1.1.0&request=GetFeature&outputFormat=GeoJSON&typename=view_geoserver_tempo_ist'
|
||||
|
||||
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
|
||||
|
||||
fb_data_types = {
|
||||
'ID': 'int',
|
||||
'NORD': 'int',
|
||||
'OST': 'int',
|
||||
'DATE': 'str',
|
||||
'HRS': 'int',
|
||||
'VELO_IN': 'int',
|
||||
'VELO_OUT': 'int',
|
||||
'FUSS_IN': 'int',
|
||||
'FUSS_OUT': 'int',
|
||||
'Weekday_en': 'str'
|
||||
}
|
||||
|
||||
miv_data_types = {
|
||||
'ID': 'int',
|
||||
'MSID': 'str',
|
||||
'ZSID': 'str',
|
||||
'Achse': 'str',
|
||||
'NKoord': 'int',
|
||||
'EKoord': 'int',
|
||||
'Richtung': 'str',
|
||||
'AnzFahrzeuge': 'int',
|
||||
'AnzFahrzeugeStatus': 'str',
|
||||
'Datum': 'str',
|
||||
'Hrs': 'int',
|
||||
'Weekday_en': 'str',
|
||||
}
|
||||
|
||||
acc_data_types = {
|
||||
'AccidentUID': 'str',
|
||||
'AccidentYear': 'int',
|
||||
'AccidentMonth': 'int',
|
||||
'AccidentWeekDay_en': 'str',
|
||||
'AccidentHour': 'int',
|
||||
'NKoord': 'int',
|
||||
'EKoord': 'int',
|
||||
'AccidentType_en': 'str',
|
||||
'AccidentType': 'str',
|
||||
'AccidentSeverityCategory': 'str',
|
||||
'AccidentInvolvingPedestrian': 'bool',
|
||||
'AccidentInvolvingBicycle': 'bool',
|
||||
'AccidentInvolvingMotorcycle': 'bool',
|
||||
'RoadType': 'str',
|
||||
'RoadType_en': 'str',
|
||||
'geometry': 'str' # TODO: Figure out what dtype this needs to be for postgres
|
||||
}
|
||||
# def ensure_dirs_exist(data_dir, integrated_dir, logs_dir):
|
||||
# """
|
||||
# This should be called before anything else to make sure that the relevant directories exists.
|
||||
# :param data_dir: directory where the datasets are stored
|
||||
# :param integrated_dir: directory where the integrated data will be stored
|
||||
# :return:
|
||||
# """
|
||||
# logger.debug(f'data_dir: {data_dir}\n integrated_dir: {integrated_dir}')
|
||||
# logger.info("Ensuring needed directories exist.")
|
||||
# os.makedirs(data_dir, exist_ok=True)
|
||||
# logger.debug("data_dir created.")
|
||||
# os.makedirs(integrated_dir, exist_ok=True)
|
||||
# logger.debug("integrated_dir created")
|
||||
# os.makedirs(logs_dir, exist_ok=True)
|
||||
# logger.debug("logs_dir created")
|
||||
|
||||
|
||||
def process_foot_bike_data(files_present=True):
|
||||
fb_df_unified = du.create_unified_df(foot_bike_urls_file, foot_bike_file_u_string, data_dir,
|
||||
files_present=files_present)
|
||||
fb_df_unified[['DATE', "TIME"]] = fb_df_unified['DATUM'].str.split('T', expand=True)
|
||||
fb_df_unified[['HRS', 'MINS']] = fb_df_unified['TIME'].str.split(':', expand=True)
|
||||
# Maybe we do need FK_ZAEHLER after all
|
||||
fb_cols_to_drop = ['DATUM']
|
||||
fb_df_unified_correct_cols = fb_df_unified.drop(columns=fb_cols_to_drop, axis=1)
|
||||
fb_df_unified_correct_cols.fillna(0, inplace=True)
|
||||
fb_df_grouped = fb_df_unified_correct_cols.groupby(['OST', 'NORD', 'DATE', 'HRS']).agg({
|
||||
'VELO_IN': 'sum',
|
||||
'VELO_OUT': 'sum',
|
||||
'FUSS_IN': 'sum',
|
||||
'FUSS_OUT': 'sum'
|
||||
}).reset_index()
|
||||
dt_obj = pd.to_datetime(fb_df_grouped['DATE'])
|
||||
days = dt_obj.dt.weekday
|
||||
fb_df_grouped['Weekday_en'] = days.map(lambda x: weekday_names[x])
|
||||
cleaned_fb_df = fb_df_grouped
|
||||
cleaned_fb_df['ID'] = cleaned_fb_df.index + 1
|
||||
cleaned_fb_df = cleaned_fb_df[['ID', 'NORD', 'OST', 'DATE', 'HRS', 'VELO_IN', 'VELO_OUT', 'FUSS_IN',
|
||||
'FUSS_OUT', 'Weekday_en']]
|
||||
# Ensure datatype of df and sql table match
|
||||
cleaned_fb_df = cleaned_fb_df.astype(fb_data_types)
|
||||
return cleaned_fb_df
|
||||
|
||||
|
||||
def process_miv_data(files_present=True):
|
||||
miv_df_unified = du.create_unified_df(miv_file_urls, motor_file_u_string, data_dir, files_present=files_present)
|
||||
|
||||
miv_df_unified[['Datum', "Time"]] = miv_df_unified['MessungDatZeit'].str.split('T', expand=True)
|
||||
miv_df_unified[['Hrs', 'Mins', 'Sec']] = miv_df_unified['Time'].str.split(':', expand=True)
|
||||
|
||||
miv_cols_to_keep = ['MSID','ZSID','Achse', 'NKoord', 'EKoord', 'Richtung', 'AnzFahrzeuge', 'AnzFahrzeugeStatus',
|
||||
'Datum', 'Hrs',]
|
||||
miv_df_cols_dropped = miv_df_unified[miv_cols_to_keep]
|
||||
|
||||
dt_obj = pd.to_datetime(miv_df_cols_dropped['Datum'])
|
||||
days = dt_obj.dt.weekday
|
||||
miv_df_cols_dropped.loc[:, 'Weekday_en'] = days.map(lambda x: weekday_names[x])
|
||||
|
||||
miv_df_cols_dropped.loc[:, 'AnzFahrzeuge'] = miv_df_cols_dropped['AnzFahrzeuge'].fillna(0).astype(int)
|
||||
miv_df_cols_dropped.loc[:, 'ZSID'] = miv_df_cols_dropped['ZSID'].fillna('Missing').astype(str)
|
||||
miv_df_cols_dropped['ID'] = (miv_df_cols_dropped.index + 1).copy()
|
||||
|
||||
cleaned_miv_df = miv_df_cols_dropped[['ID', 'MSID', 'ZSID', 'Achse', 'NKoord', 'EKoord', 'Richtung', 'AnzFahrzeuge',
|
||||
'AnzFahrzeugeStatus', 'Datum', 'Hrs', 'Weekday_en']]
|
||||
|
||||
cleaned_miv_df = cleaned_miv_df.astype(miv_data_types)
|
||||
cleaned_miv_df = cleaned_miv_df.drop_duplicates()
|
||||
return cleaned_miv_df
|
||||
|
||||
|
||||
def process_accident_data(file_present: bool = True):
|
||||
if not file_present:
|
||||
du.process_urls(data_dir, accident_file_url)
|
||||
acc_df_unified = du.load_dataframes_from_geojson_files(data_dir, accident_file_u_string)
|
||||
acc_cols_to_keep = ['AccidentUID', 'AccidentYear', 'AccidentMonth', 'AccidentWeekDay_en','AccidentHour',
|
||||
'AccidentLocation_CHLV95_N', 'AccidentLocation_CHLV95_E', 'AccidentType_en', 'AccidentType',
|
||||
'AccidentSeverityCategory', 'AccidentInvolvingPedestrian', 'AccidentInvolvingBicycle',
|
||||
'AccidentInvolvingMotorcycle', 'RoadType', 'RoadType_en',
|
||||
'geometry']
|
||||
cleaned_acc_df = acc_df_unified[acc_cols_to_keep]
|
||||
cleaned_acc_df.rename(columns={
|
||||
'AccidentLocation_CHLV95_E': 'EKoord',
|
||||
'AccidentLocation_CHLV95_N': 'NKoord',
|
||||
}, inplace=True)
|
||||
|
||||
cleaned_acc_df = cleaned_acc_df.astype(acc_data_types)
|
||||
return cleaned_acc_df
|
||||
|
||||
|
||||
def process_all_data_sources(fb_present=True, miv_present=True, accident_present=True):
|
||||
"""
|
||||
Process all data sources and turn them into CSV files. After this function is called, there
should be CSV files of the cleaned and integrated data sources.

:param fb_present: bool, whether the foot/bike files are present in the local file system
:param miv_present: bool, whether the motor vehicle (MIV) files are present in the local file system
:param accident_present: bool, whether the accident file is present in the local file system
|
||||
:return:
|
||||
"""
|
||||
# ensure_dirs_exist(data_dir, integrated_dir)
|
||||
logger.info("Started processing all data sources.")
|
||||
fb_to_integrated(fb_present)
|
||||
|
||||
miv_to_integrated_csv(miv_present)
|
||||
|
||||
acc_to_cleaned_geojson(accident_present)
|
||||
|
||||
|
||||
def fb_to_integrated(files_present=True):
|
||||
|
||||
start_time = time.time()
|
||||
logger.info("Start processing pedestrian and bicycle data (FootBikeCount)")
|
||||
fb_count_df = process_foot_bike_data(files_present)
|
||||
logger.debug(f'FB Head:{fb_count_df.head()}\n FB dtypes: {fb_count_df.dtypes}')
|
||||
fb_file_path = os.path.join(integrated_dir, 'FootBikeCount.csv')
|
||||
logger.debug(f'FB Cleaned File Path: {fb_file_path}')
|
||||
fb_count_df.to_csv(fb_file_path, index=False)
|
||||
logger.info("FB integrated csv created.")
|
||||
end_time = time.time()
|
||||
logger.info(f'Time taken for FootBikeCount: {end_time-start_time}')
|
||||
|
||||
|
||||
def miv_to_integrated_csv(miv_present=True):
|
||||
|
||||
start_time2 = time.time()
|
||||
logger.info("Start processing motorized vehicle data (MivCount)")
|
||||
miv_count_df = process_miv_data(miv_present)
|
||||
logger.debug(f'MIV Head:{miv_count_df.head()}\n MIV dtypes: {miv_count_df.dtypes}')
|
||||
miv_file_path = os.path.join(integrated_dir, 'MivCount.csv')
|
||||
logger.debug(f'MIV Cleaned File Path: {miv_file_path}')
|
||||
miv_count_df.to_csv(miv_file_path, index=False)
|
||||
logger.info("MIV integrated csv created.")
|
||||
end_time = time.time()
|
||||
logger.info(f'Time taken for MivCount: {end_time-start_time2}')
|
||||
|
||||
|
||||
def acc_to_cleaned_geojson(acc_present=True):
|
||||
start_time3 = time.time()
|
||||
logger.info("Start processing accident data (Accidents)")
|
||||
acc_df = process_accident_data(acc_present)
|
||||
logger.debug(f'ACC Head: { acc_df.head()}\n Acc dtypes: {acc_df.dtypes}')
|
||||
acc_file_path = os.path.join(integrated_dir, 'Accidents.geojson')
|
||||
logger.debug(f'Acc Cleaned file path: {acc_file_path}')
|
||||
acc_df['geometry'] = acc_df['geometry'].apply(lambda row: re.findall(r"[-+]?\d*\.\d+|\d+", row))
|
||||
# Create a Point object using the extracted coordinates
|
||||
acc_df['geometry'] = acc_df['geometry'].apply(
|
||||
lambda coords: Point(float(coords[0]), float(coords[1]), float(coords[2])))
|
||||
acc_gdf = gpd.GeoDataFrame(acc_df, geometry='geometry')
|
||||
acc_gdf.to_file(acc_file_path, driver='GeoJSON')
|
||||
logger.info("ACC integrated csv created.")
|
||||
end_time = time.time()
|
||||
logger.info(f'Time taken for Accidents: {end_time - start_time3}')
|
||||
|
||||
|
||||
def load_tempo_geojson_from_api_to_local():
|
||||
du.load_file_from_api(signaled_speeds_json_api, 'signaled_speeds.geojson', integrated_dir)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# ensure_dirs_exist(data_dir, integrated_dir, logs_dir)
|
||||
# process_all_data_sources(True, True, True)
|
||||
# miv_to_integrated_csv()
|
||||
# acc_to_cleaned_geojson()
|
||||
load_tempo_geojson_from_api_to_local()
|
||||
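For reference, a minimal driver equivalent to the commented-out calls in ``__main__`` might look like this (run from ``src/`` after ``ensure_dirs_exist.py``; set the flags to ``False`` if the raw files still need to be downloaded):

```
import integrate as intr

# Clean and integrate all three sources into datasets/integrated/.
intr.process_all_data_sources(fb_present=True, miv_present=True, accident_present=True)

# Additionally fetch the signalled-speeds GeoJSON from the city's WFS API.
intr.load_tempo_geojson_from_api_to_local()
```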
15 src/load_accidents_into_db.sh Normal file
@@ -0,0 +1,15 @@
#!/bin/bash

# Define parameters
GEOJSON_FILE=$1
DB_NAME=$2
DB_USER=$3
DB_PASSWORD=$4
DB_HOST=$5
DB_PORT=$6
TARGET_TABLE=$7

# Run ogr2ogr command
ogr2ogr -f "PostgreSQL" PG:"dbname='$DB_NAME' host='$DB_HOST' port='$DB_PORT' user='$DB_USER' password='$DB_PASSWORD'" "$GEOJSON_FILE" -nln $TARGET_TABLE -append

echo "GeoJSON data has been imported into $TARGET_TABLE"
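An example invocation of the loader script, using the same placeholder connection values as ``db_info`` in ``fill_db.py`` (adapt database, user and password to your setup):

```
chmod +x load_accidents_into_db.sh
bash load_accidents_into_db.sh \
    datasets/integrated/Accidents.geojson \
    test-db23 seb '' localhost 5432 accidents
```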
7 src/load_csvs_into_db.sql Normal file
@@ -0,0 +1,7 @@
COPY FootBikeCount FROM '/Users/seb/Projects/repos/group-1/src/datasets/integrated/FootBikeCount.csv'
DELIMITER ','
CSV HEADER;

COPY MivCount FROM '/Users/seb/Projects/repos/group-1/src/datasets/integrated/MivCount.csv'
DELIMITER ','
CSV HEADER;
42 src/queries.sql Normal file
@@ -0,0 +1,42 @@
|
||||
select p.id, a.accidentuid, m.id
|
||||
from footbikecount p, accidents a, mivcount m
|
||||
where p.weekday_en = a.accidentweekday_en AND a.accidentweekday_en = m.weekday_en
|
||||
AND p.weekday_en = m.weekday_en AND p.hrs = a.accidenthour AND a.accidenthour = m.hrs
|
||||
AND p.hrs = m.hrs AND (p.ost - m.ekoord between -100 AND 100) AND (p.nord - m.nkoord between -100 AND 100);
|
||||
|
||||
DROP TABLE IF EXISTS Contemporaneous2;
|
||||
|
||||
CREATE TABLE Contemporaneous2 (
|
||||
p_id INTEGER,
|
||||
accidentuid VARCHAR(256),
|
||||
m_id INTEGER,
|
||||
weekday_en VARCHAR(10),
|
||||
hrs INTEGER,
|
||||
distance DOUBLE PRECISION
|
||||
);
|
||||
|
||||
|
||||
CREATE TABLE Intermediate2 AS
|
||||
SELECT
|
||||
p.id AS p_id,
|
||||
a.accidentuid,
|
||||
m.id AS m_id,
|
||||
p.weekday_en,
|
||||
p.hrs,
|
||||
SQRT(POWER(p.ost - m.ekoord, 2) + POWER(p.nord - m.nkoord, 2)) AS distance
|
||||
FROM
|
||||
footbikecount p,
|
||||
accidents a,
|
||||
mivcount m
|
||||
WHERE
|
||||
p.weekday_en = a.accidentweekday_en
|
||||
AND a.accidentweekday_en = m.weekday_en
|
||||
AND p.weekday_en = m.weekday_en
|
||||
AND p.hrs = a.accidenthour
|
||||
AND a.accidenthour = m.hrs
|
||||
AND p.hrs = m.hrs
|
||||
AND (p.ost - m.ekoord BETWEEN -100 AND 100)
|
||||
AND (p.nord - m.nkoord BETWEEN -100 AND 100);
|
||||
|
||||
INSERT INTO Contemporaneous2 (p_id, accidentuid, m_id, weekday_en, hrs, distance)
|
||||
SELECT p_id, accidentuid, m_id, weekday_en, hrs, distance FROM Intermediate2;
|
||||
72 src/setup_tables.sql Normal file
@@ -0,0 +1,72 @@
|
||||
CREATE EXTENSION IF NOT EXISTS postgis;
|
||||
|
||||
DROP TABLE IF EXISTS FootBikeCount;
|
||||
|
||||
DROP TABLE IF EXISTS Accidents;
|
||||
|
||||
DROP TABLE IF EXISTS MivCount;
|
||||
|
||||
|
||||
CREATE TABLE FootBikeCount (
|
||||
ID INTEGER ,
|
||||
NORD INTEGER ,
|
||||
OST INT ,
|
||||
DATE VARCHAR(10) ,
|
||||
HRS INTEGER ,
|
||||
VELO_IN INTEGER ,
|
||||
VELO_OUT INTEGER ,
|
||||
FUSS_IN INTEGER ,
|
||||
FUSS_OUT INTEGER ,
|
||||
Weekday_en VARCHAR(10) ,
|
||||
|
||||
PRIMARY KEY(ID) ,
|
||||
CHECK (Weekday_en IN ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')),
|
||||
CHECK (Hrs BETWEEN 0 AND 23)
|
||||
|
||||
|
||||
);
|
||||
|
||||
|
||||
|
||||
CREATE TABLE MivCount (
|
||||
ID INTEGER ,
|
||||
MSID VARCHAR(10) ,
|
||||
ZSID VARCHAR(10) ,
|
||||
Achse VARCHAR(256) ,
|
||||
NKoord INTEGER ,
|
||||
EKoord INTEGER ,
|
||||
Richtung VARCHAR(100) ,
|
||||
AnzFahrzeuge INTEGER ,
|
||||
AnzFahrzeugeStatus VARCHAR(20) ,
|
||||
Datum VARCHAR(10) ,
|
||||
Hrs Integer ,
|
||||
Weekday_en VARCHAR(10),
|
||||
|
||||
PRIMARY KEY (ID),
|
||||
CHECK (Weekday_en IN ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')),
|
||||
CHECK (Hrs BETWEEN 0 AND 23)
|
||||
);
|
||||
|
||||
|
||||
CREATE TABLE Accidents (
|
||||
AccidentUID VARCHAR(256) ,
|
||||
AccidentYear INTEGER ,
|
||||
AccidentMonth INTEGER,
|
||||
AccidentWeekDay_en VARCHAR(10) ,
|
||||
AccidentHour INTEGER ,
|
||||
NKoord INTEGER ,
|
||||
EKoord INTEGER ,
|
||||
AccidentType_en VARCHAR(256) ,
|
||||
AccidentType VARCHAR(4) ,
|
||||
AccidentSeverityCategory VARCHAR(4) ,
|
||||
AccidentInvolvingPedestrian BOOLEAN ,
|
||||
AccidentInvolvingBicycle BOOLEAN ,
|
||||
AccidentInvolvingMotorcycle BOOLEAN ,
|
||||
RoadType VARCHAR(5) ,
|
||||
RoadType_en VARCHAR(256) ,
|
||||
Geometry geometry(Point, 4326) ,
|
||||
|
||||
PRIMARY KEY (AccidentUID) ,
|
||||
CHECK ( AccidentHour BETWEEN 0 AND 23) ,
|
||||
CHECK (AccidentWeekDay_en IN ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'))
|
||||
);
|
||||
233 src/testArea.ipynb Normal file
@@ -0,0 +1,233 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"from datetime import datetime as dt\n",
|
||||
"\n",
|
||||
"import integrate as intr\n",
|
||||
"\n"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-12-03T15:47:10.110909Z",
|
||||
"start_time": "2023-12-03T15:47:09.656556Z"
|
||||
}
|
||||
},
|
||||
"id": "be55b25929d95559"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/Users/seb/Projects/repos/group-1/src/integrate.py:132: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||||
" miv_df_cols_dropped['Weekday_en'] = days.map(lambda x: weekday_names[x])\n",
|
||||
"/Users/seb/Projects/repos/group-1/src/integrate.py:133: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||||
" miv_df_cols_dropped['AnzFahrzeuge'] = miv_df_cols_dropped['AnzFahrzeuge'].fillna(0).astype(int)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"miv_df = intr.process_miv_data()\n",
|
||||
"#fb_data = intr.process_foot_bike_data()"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-12-03T15:49:07.561603Z",
|
||||
"start_time": "2023-12-03T15:47:14.759104Z"
|
||||
}
|
||||
},
|
||||
"id": "dd3831953afdeb72"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"121\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"duplicate_rows = miv_df[miv_df.duplicated()]\n",
|
||||
"print(duplicate_rows.shape[0])"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-12-03T15:51:21.158909Z",
|
||||
"start_time": "2023-12-03T15:51:15.711222Z"
|
||||
}
|
||||
},
|
||||
"id": "14471cd78389ce4d"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"acc_df = intr.process_accident_data(True)"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"id": "f86bc612060b17a4"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"acc_df.head()\n",
|
||||
"acc_df['AccidentWeekDay_en'].unique()\n",
|
||||
"#acc_df.dtypes\n",
|
||||
"\n"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"id": "6affbeea6c7cf3ef"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"Accident Columns:\")\n",
|
||||
"print(acc_df.dtypes)\n",
|
||||
"print()\n",
|
||||
"print(\"MIV Columns:\")\n",
|
||||
"print(miv_df.dtypes)\n",
|
||||
"print()\n",
|
||||
"print(\"FB Cols:\")\n",
|
||||
"print(fb_data.dtypes)"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"id": "242041cd369d8454"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"acc_df['ID'] = acc_df.index +1\n",
|
||||
"acc_df[('ID')]"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"id": "1841925ee109a417"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"MIV unqiue:\", miv_df['EKoord'])\n",
|
||||
"print(\"Acc unique:\", acc_df['RoadType'].unique)\n",
|
||||
"print(\"FB unique: \", fb_data['DATE'])\n"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"id": "f6d752ea17eda341"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"acc_df.head()"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"id": "a159cafa9c227b88"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sqlalchemy import create_engine\n",
|
||||
"from geoalchemy2 import Geometry, WKTElement\n",
|
||||
"import geopandas as gpd\n",
|
||||
"from shapely import wkt\n",
|
||||
"\n",
|
||||
"db_url = f'postgresql://seb:@localhost:5432/test-db23'\n",
|
||||
"engine = create_engine(db_url)\n",
|
||||
"\n",
|
||||
"#miv_df.to_sql('table_name', engine, if_exists='replace', index=False)\n",
|
||||
"#fb_data.to_sql('footbike', engine, if_exists='replace', index=False)\n",
|
||||
"\n",
|
||||
"geometry_column = 'geometry'\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"acc_df['geometry'] = acc_df['geometry'].apply(lambda geom: WKTElement(geom.wkt, srid=4326))\n",
|
||||
"\n",
|
||||
"acc_df.to_sql('accidents', engine, if_exists='replace', index=False, dtype={'geometry': Geometry('POINT', srid=4326)})\n",
|
||||
"\n"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"id": "fa76af8343443d7a"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"engine.dispose()"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"id": "bc0a23a5126e76c2"
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 2
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython2",
|
||||
"version": "2.7.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}