From 94ee3cc3b0805b9ef55ac8b8e655016bfb5767d7 Mon Sep 17 00:00:00 2001 From: Sebastian Lenzlinger <74497638+sebaschi@users.noreply.github.com> Date: Sun, 3 Dec 2023 13:18:01 +0100 Subject: [PATCH] ADD ID to fb. --- src/integrate.py | 9 +- src/testArea.ipynb | 199 ++++++++------------------------------------- 2 files changed, 42 insertions(+), 166 deletions(-) diff --git a/src/integrate.py b/src/integrate.py index bdc39d5..41c5b64 100644 --- a/src/integrate.py +++ b/src/integrate.py @@ -43,6 +43,7 @@ def process_foot_bike_data(): days = dt_obj.dt.weekday fb_df_grouped['Weekday_en'] = days.map(lambda x: weekday_names[x]) cleaned_fb_df = fb_df_grouped + cleaned_fb_df['ID'] = cleaned_fb_df.index + 1 return cleaned_fb_df @@ -65,7 +66,7 @@ def process_miv_data(): return cleaned_miv_df -def process_accident_data(file_present: bool): +def process_accident_data(file_present: bool = True): if not file_present: du.process_urls(data_dir, accident_file_url) acc_df_unified = du.load_dataframes_from_geojson_files(data_dir, accident_file_u_string) @@ -74,10 +75,14 @@ def process_accident_data(file_present: bool): 'AccidentInvolvingMotorcycle', 'RoadType', 'RoadType_en', 'AccidentLocation_CHLV95_E', 'AccidentLocation_CHLV95_N', 'AccidentMonth', 'geometry'] cleaned_acc_df = acc_df_unified[acc_cols_to_keep] + cleaned_acc_df.rename(columns={ + 'AccidentLocation_CHLV95_E': 'EKoord', + 'AccidentLocation_CHLV95_N': 'NKoord', + }, inplace=True) return cleaned_acc_df if __name__ == '__main__': - acc_df = process_accident_data(False) + acc_df = process_accident_data(True) print(acc_df.dtypes) print(acc_df.head(100)) diff --git a/src/testArea.ipynb b/src/testArea.ipynb index c4739bb..3104921 100644 --- a/src/testArea.ipynb +++ b/src/testArea.ipynb @@ -13,29 +13,16 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2023-12-03T10:58:50.698090Z", - "start_time": "2023-12-03T10:58:50.384352Z" + "end_time": "2023-12-03T12:17:02.386525Z", + "start_time": "2023-12-03T12:17:01.722469Z" } }, "id": "be55b25929d95559" }, { "cell_type": "code", - "execution_count": 2, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/seb/Projects/repos/group-1/src/integrate.py:55: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " miv_df_cols_dropped['Weekday_en'] = days.map(lambda x: weekday_names[x])\n" - ] - } - ], + "execution_count": null, + "outputs": [], "source": [ "\n", "miv_df = intr.process_miv_data()\n", @@ -43,129 +30,60 @@ ], "metadata": { "collapsed": false, + "is_executing": true, "ExecuteTime": { - "end_time": "2023-12-03T11:01:14.422749Z", - "start_time": "2023-12-03T10:58:52.300667Z" + "start_time": "2023-12-03T12:17:04.199209Z" } }, "id": "dd3831953afdeb72" }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "outputs": [], "source": [ "test_df = miv_df\n" ], "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2023-12-02T23:48:08.239957Z", - "start_time": "2023-12-02T23:48:08.230590Z" - } + "collapsed": false }, "id": "14471cd78389ce4d" }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "outputs": [], "source": [ "test_df.dtypes\n", "date_object = pd.to_datetime(test_df['Date'])\n" ], "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2023-12-02T23:48:09.754205Z", - "start_time": "2023-12-02T23:48:08.232651Z" - } + "collapsed": false }, "id": "c70d21adef38fd68" }, { "cell_type": "code", - "execution_count": 5, - "outputs": [ - { - "ename": "KeyError", - "evalue": "'Weekday_Name'", - "output_type": "error", - "traceback": [ - "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[0;31mKeyError\u001B[0m Traceback (most recent call last)", - "File \u001B[0;32m~/Projects/repos/group-1/db23-project-venv/lib/python3.11/site-packages/pandas/core/indexes/base.py:3790\u001B[0m, in \u001B[0;36mIndex.get_loc\u001B[0;34m(self, key)\u001B[0m\n\u001B[1;32m 3789\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m-> 3790\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_engine\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_loc\u001B[49m\u001B[43m(\u001B[49m\u001B[43mcasted_key\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 3791\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m err:\n", - "File \u001B[0;32mindex.pyx:152\u001B[0m, in \u001B[0;36mpandas._libs.index.IndexEngine.get_loc\u001B[0;34m()\u001B[0m\n", - "File \u001B[0;32mindex.pyx:181\u001B[0m, in \u001B[0;36mpandas._libs.index.IndexEngine.get_loc\u001B[0;34m()\u001B[0m\n", - "File \u001B[0;32mpandas/_libs/hashtable_class_helper.pxi:7080\u001B[0m, in \u001B[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001B[0;34m()\u001B[0m\n", - "File \u001B[0;32mpandas/_libs/hashtable_class_helper.pxi:7088\u001B[0m, in \u001B[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001B[0;34m()\u001B[0m\n", - "\u001B[0;31mKeyError\u001B[0m: 'Weekday_Name'", - "\nThe above exception was the direct cause of the following exception:\n", - "\u001B[0;31mKeyError\u001B[0m Traceback (most recent call last)", - "Cell \u001B[0;32mIn[5], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[43mtest_df\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mWeekday_Name\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m]\u001B[49m\n", - "File \u001B[0;32m~/Projects/repos/group-1/db23-project-venv/lib/python3.11/site-packages/pandas/core/frame.py:3893\u001B[0m, in \u001B[0;36mDataFrame.__getitem__\u001B[0;34m(self, key)\u001B[0m\n\u001B[1;32m 3891\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcolumns\u001B[38;5;241m.\u001B[39mnlevels \u001B[38;5;241m>\u001B[39m \u001B[38;5;241m1\u001B[39m:\n\u001B[1;32m 3892\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_getitem_multilevel(key)\n\u001B[0;32m-> 3893\u001B[0m indexer \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcolumns\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_loc\u001B[49m\u001B[43m(\u001B[49m\u001B[43mkey\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 3894\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m is_integer(indexer):\n\u001B[1;32m 3895\u001B[0m indexer \u001B[38;5;241m=\u001B[39m [indexer]\n", - "File \u001B[0;32m~/Projects/repos/group-1/db23-project-venv/lib/python3.11/site-packages/pandas/core/indexes/base.py:3797\u001B[0m, in \u001B[0;36mIndex.get_loc\u001B[0;34m(self, key)\u001B[0m\n\u001B[1;32m 3792\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(casted_key, \u001B[38;5;28mslice\u001B[39m) \u001B[38;5;129;01mor\u001B[39;00m (\n\u001B[1;32m 3793\u001B[0m \u001B[38;5;28misinstance\u001B[39m(casted_key, abc\u001B[38;5;241m.\u001B[39mIterable)\n\u001B[1;32m 3794\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m \u001B[38;5;28many\u001B[39m(\u001B[38;5;28misinstance\u001B[39m(x, \u001B[38;5;28mslice\u001B[39m) \u001B[38;5;28;01mfor\u001B[39;00m x \u001B[38;5;129;01min\u001B[39;00m casted_key)\n\u001B[1;32m 3795\u001B[0m ):\n\u001B[1;32m 3796\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m InvalidIndexError(key)\n\u001B[0;32m-> 3797\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m(key) \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01merr\u001B[39;00m\n\u001B[1;32m 3798\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mTypeError\u001B[39;00m:\n\u001B[1;32m 3799\u001B[0m \u001B[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001B[39;00m\n\u001B[1;32m 3800\u001B[0m \u001B[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001B[39;00m\n\u001B[1;32m 3801\u001B[0m \u001B[38;5;66;03m# the TypeError.\u001B[39;00m\n\u001B[1;32m 3802\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_check_indexing_error(key)\n", - "\u001B[0;31mKeyError\u001B[0m: 'Weekday_Name'" - ] - } - ], + "execution_count": null, + "outputs": [], "source": [ "test_df['Weekday_Name']" ], "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2023-12-02T23:48:10.103198Z", - "start_time": "2023-12-02T23:48:09.756006Z" - } + "collapsed": false }, "id": "d0df3c0ef49e8061" }, { "cell_type": "code", - "execution_count": 3, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "u_string RoadTrafficAccidentLocations.json\n", - "Filename: 2017_verkehrszaehlungen_werte_fussgaenger_velo.csv\n", - "Filename: RoadTrafficAccidentLocations.json\n", - "Filepath: datasets/RoadTrafficAccidentLocations.json\n", - "Filename: 2016_verkehrszaehlungen_werte_fussgaenger_velo.csv\n", - "Filename: 2022_verkehrszaehlungen_werte_fussgaenger_velo.csv\n", - "Filename: 2015_verkehrszaehlungen_werte_fussgaenger_velo.csv\n", - "Filename: 2019_verkehrszaehlungen_werte_fussgaenger_velo.csv\n", - "Filename: sid_dav_verkehrszaehlung_miv_OD2031_2013.csv\n", - "Filename: 2021_verkehrszaehlungen_werte_fussgaenger_velo.csv\n", - "Filename: sid_dav_verkehrszaehlung_miv_OD2031_2012.csv\n", - "Filename: sid_dav_verkehrszaehlung_miv_OD2031_2016.csv\n", - "Filename: 2014_verkehrszaehlungen_werte_fussgaenger_velo.csv\n", - "Filename: 2018_verkehrszaehlungen_werte_fussgaenger_velo.csv\n", - "Filename: sid_dav_verkehrszaehlung_miv_OD2031_2017.csv\n", - "Filename: sid_dav_verkehrszaehlung_miv_OD2031_2015.csv\n", - "Filename: 2020_verkehrszaehlungen_werte_fussgaenger_velo.csv\n", - "Filename: sid_dav_verkehrszaehlung_miv_OD2031_2014.csv\n", - "Filename: sid_dav_verkehrszaehlung_miv_OD2031_2019.csv\n", - "Filename: sid_dav_verkehrszaehlung_miv_OD2031_2018.csv\n", - "Filename: 2013_verkehrszaehlungen_werte_fussgaenger_velo.csv\n", - "Filename: sid_dav_verkehrszaehlung_miv_OD2031_2022.csv\n", - "Filename: sid_dav_verkehrszaehlung_miv_OD2031_2020.csv\n", - "Filename: 2012_verkehrszaehlungen_werte_fussgaenger_velo.csv\n", - "Filename: sid_dav_verkehrszaehlung_miv_OD2031_2021.csv\n" - ] - } - ], + "execution_count": null, + "outputs": [], "source": [ - "acc_df = intr.process_accident_data()" + "acc_df = intr.process_accident_data(True)" ], "metadata": { "collapsed": false, - "ExecuteTime": { - "end_time": "2023-12-03T11:15:51.051154Z", - "start_time": "2023-12-03T11:15:36.154717Z" - } + "is_executing": true }, "id": "f86bc612060b17a4" }, @@ -175,15 +93,12 @@ "outputs": [], "source": [ "acc_df.head()\n", - "acc_df['AccidentWeekDay'].unique()\n", + "acc_df['AccidentWeekDay_en'].unique()\n", "#acc_df.dtypes\n", - "date_obj = dt.strptime(acc_df[''])\n" + "\n" ], "metadata": { - "collapsed": false, - "ExecuteTime": { - "start_time": "2023-12-02T23:48:10.101387Z" - } + "collapsed": false }, "id": "6affbeea6c7cf3ef" }, @@ -203,9 +118,7 @@ ], "metadata": { "collapsed": false, - "ExecuteTime": { - "start_time": "2023-12-02T23:48:10.102789Z" - } + "is_executing": true }, "id": "242041cd369d8454" }, @@ -213,12 +126,12 @@ "cell_type": "code", "execution_count": null, "outputs": [], - "source": [], + "source": [ + "acc_df['ID'] = acc_df.index +1\n", + "acc_df[('ID')]" + ], "metadata": { - "collapsed": false, - "ExecuteTime": { - "start_time": "2023-12-02T23:48:10.103954Z" - } + "collapsed": false }, "id": "1841925ee109a417" }, @@ -232,60 +145,26 @@ "print(\"FB unique: \", fb_data['OST'].unique())\n" ], "metadata": { - "collapsed": false, - "ExecuteTime": { - "start_time": "2023-12-02T23:48:10.104894Z" - } + "collapsed": false }, "id": "f6d752ea17eda341" }, { "cell_type": "code", - "execution_count": 9, - "outputs": [ - { - "data": { - "text/plain": " AccidentUID AccidentHour AccidentYear \\\n0 A2D2677533867004E0430A865E337004 00 2011 \n1 9FD6441F802C20A6E0430A865E3320A6 01 2011 \n2 9FDA0DC4856A6094E0430A865E336094 02 2011 \n3 A3B66E42396E6000E0430A865E336000 02 2011 \n4 9FDA0DBE8CCE9096E0430A865E339096 03 2011 \n\n AccidentWeekDay_en AccidentType AccidentSeverityCategory \\\n0 Saturday at0 as4 \n1 Saturday at0 as3 \n2 Saturday at0 as4 \n3 Saturday at5 as3 \n4 Saturday at0 as4 \n\n AccidentInvolvingPedestrian AccidentInvolvingBicycle \\\n0 false false \n1 false true \n2 false false \n3 false false \n4 false false \n\n AccidentInvolvingMotorcycle RoadType RoadType_en AccidentLocation_CHLV95_E \\\n0 false rt433 Minor road 2684605 \n1 false rt433 Minor road 2682382 \n2 false rt439 Other 2682791 \n3 false rt433 Minor road 2681199 \n4 false rt433 Minor road 2682479 \n\n AccidentLocation_CHLV95_N geometry \n0 1245194 POINT(8.55841025 47.3521677) \n1 1246980 POINT(8.52932024 47.36851152) \n2 1247749 POINT(8.5348767 47.37537618) \n3 1247102 POINT(8.51368203 47.36975554) \n4 1250690 POINT(8.53128819 47.40186473) ", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
AccidentUIDAccidentHourAccidentYearAccidentWeekDay_enAccidentTypeAccidentSeverityCategoryAccidentInvolvingPedestrianAccidentInvolvingBicycleAccidentInvolvingMotorcycleRoadTypeRoadType_enAccidentLocation_CHLV95_EAccidentLocation_CHLV95_Ngeometry
0A2D2677533867004E0430A865E337004002011Saturdayat0as4falsefalsefalsert433Minor road26846051245194POINT(8.55841025 47.3521677)
19FD6441F802C20A6E0430A865E3320A6012011Saturdayat0as3falsetruefalsert433Minor road26823821246980POINT(8.52932024 47.36851152)
29FDA0DC4856A6094E0430A865E336094022011Saturdayat0as4falsefalsefalsert439Other26827911247749POINT(8.5348767 47.37537618)
3A3B66E42396E6000E0430A865E336000022011Saturdayat5as3falsefalsefalsert433Minor road26811991247102POINT(8.51368203 47.36975554)
49FDA0DBE8CCE9096E0430A865E339096032011Saturdayat0as4falsefalsefalsert433Minor road26824791250690POINT(8.53128819 47.40186473)
\n
" - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "outputs": [], "source": [ "acc_df.head()" ], "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2023-12-02T23:53:22.460557Z", - "start_time": "2023-12-02T23:53:22.453434Z" - } + "collapsed": false }, "id": "a159cafa9c227b88" }, { "cell_type": "code", - "execution_count": 21, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/var/folders/s3/8bc7ys2d24lgqhdlpttvp70r0000gn/T/ipykernel_59953/958527375.py:15: UserWarning: Geometry column does not contain geometry.\n", - " acc_df['geometry'] = acc_df['geometry'].apply(lambda geom: WKTElement(geom.wkt, srid=4326))\n" - ] - }, - { - "data": { - "text/plain": "826" - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "outputs": [], "source": [ "from sqlalchemy import create_engine\n", "from geoalchemy2 import Geometry, WKTElement\n", @@ -307,27 +186,19 @@ "\n" ], "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2023-12-03T00:00:35.257439Z", - "start_time": "2023-12-03T00:00:32.802219Z" - } + "collapsed": false }, "id": "fa76af8343443d7a" }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "outputs": [], "source": [ "engine.dispose()" ], "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2023-12-03T00:00:40.409019Z", - "start_time": "2023-12-03T00:00:40.406193Z" - } + "collapsed": false }, "id": "bc0a23a5126e76c2" }