Add ID column to the foot/bike (fb) DataFrame.

This commit is contained in:
Sebastian Lenzlinger 2023-12-03 13:18:01 +01:00
parent 65bd9480e7
commit 94ee3cc3b0
2 changed files with 42 additions and 166 deletions

View File

@ -43,6 +43,7 @@ def process_foot_bike_data():
days = dt_obj.dt.weekday
fb_df_grouped['Weekday_en'] = days.map(lambda x: weekday_names[x])
cleaned_fb_df = fb_df_grouped
cleaned_fb_df['ID'] = cleaned_fb_df.index + 1
return cleaned_fb_df
@ -65,7 +66,7 @@ def process_miv_data():
return cleaned_miv_df
def process_accident_data(file_present: bool):
def process_accident_data(file_present: bool = True):
if not file_present:
du.process_urls(data_dir, accident_file_url)
acc_df_unified = du.load_dataframes_from_geojson_files(data_dir, accident_file_u_string)
@ -74,10 +75,14 @@ def process_accident_data(file_present: bool):
'AccidentInvolvingMotorcycle', 'RoadType', 'RoadType_en', 'AccidentLocation_CHLV95_E',
'AccidentLocation_CHLV95_N', 'AccidentMonth', 'geometry']
cleaned_acc_df = acc_df_unified[acc_cols_to_keep]
cleaned_acc_df.rename(columns={
'AccidentLocation_CHLV95_E': 'EKoord',
'AccidentLocation_CHLV95_N': 'NKoord',
}, inplace=True)
return cleaned_acc_df
if __name__ == '__main__':
acc_df = process_accident_data(False)
acc_df = process_accident_data(True)
print(acc_df.dtypes)
print(acc_df.head(100))

View File

@ -13,29 +13,16 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-12-03T10:58:50.698090Z",
"start_time": "2023-12-03T10:58:50.384352Z"
"end_time": "2023-12-03T12:17:02.386525Z",
"start_time": "2023-12-03T12:17:01.722469Z"
}
},
"id": "be55b25929d95559"
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/seb/Projects/repos/group-1/src/integrate.py:55: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" miv_df_cols_dropped['Weekday_en'] = days.map(lambda x: weekday_names[x])\n"
]
}
],
"execution_count": null,
"outputs": [],
"source": [
"\n",
"miv_df = intr.process_miv_data()\n",
@ -43,129 +30,60 @@
],
"metadata": {
"collapsed": false,
"is_executing": true,
"ExecuteTime": {
"end_time": "2023-12-03T11:01:14.422749Z",
"start_time": "2023-12-03T10:58:52.300667Z"
"start_time": "2023-12-03T12:17:04.199209Z"
}
},
"id": "dd3831953afdeb72"
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"outputs": [],
"source": [
"test_df = miv_df\n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-12-02T23:48:08.239957Z",
"start_time": "2023-12-02T23:48:08.230590Z"
}
"collapsed": false
},
"id": "14471cd78389ce4d"
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"outputs": [],
"source": [
"test_df.dtypes\n",
"date_object = pd.to_datetime(test_df['Date'])\n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-12-02T23:48:09.754205Z",
"start_time": "2023-12-02T23:48:08.232651Z"
}
"collapsed": false
},
"id": "c70d21adef38fd68"
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"ename": "KeyError",
"evalue": "'Weekday_Name'",
"output_type": "error",
"traceback": [
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[0;31mKeyError\u001B[0m Traceback (most recent call last)",
"File \u001B[0;32m~/Projects/repos/group-1/db23-project-venv/lib/python3.11/site-packages/pandas/core/indexes/base.py:3790\u001B[0m, in \u001B[0;36mIndex.get_loc\u001B[0;34m(self, key)\u001B[0m\n\u001B[1;32m 3789\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m-> 3790\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_engine\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_loc\u001B[49m\u001B[43m(\u001B[49m\u001B[43mcasted_key\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 3791\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m err:\n",
"File \u001B[0;32mindex.pyx:152\u001B[0m, in \u001B[0;36mpandas._libs.index.IndexEngine.get_loc\u001B[0;34m()\u001B[0m\n",
"File \u001B[0;32mindex.pyx:181\u001B[0m, in \u001B[0;36mpandas._libs.index.IndexEngine.get_loc\u001B[0;34m()\u001B[0m\n",
"File \u001B[0;32mpandas/_libs/hashtable_class_helper.pxi:7080\u001B[0m, in \u001B[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001B[0;34m()\u001B[0m\n",
"File \u001B[0;32mpandas/_libs/hashtable_class_helper.pxi:7088\u001B[0m, in \u001B[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001B[0;34m()\u001B[0m\n",
"\u001B[0;31mKeyError\u001B[0m: 'Weekday_Name'",
"\nThe above exception was the direct cause of the following exception:\n",
"\u001B[0;31mKeyError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[0;32mIn[5], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[43mtest_df\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mWeekday_Name\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m]\u001B[49m\n",
"File \u001B[0;32m~/Projects/repos/group-1/db23-project-venv/lib/python3.11/site-packages/pandas/core/frame.py:3893\u001B[0m, in \u001B[0;36mDataFrame.__getitem__\u001B[0;34m(self, key)\u001B[0m\n\u001B[1;32m 3891\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcolumns\u001B[38;5;241m.\u001B[39mnlevels \u001B[38;5;241m>\u001B[39m \u001B[38;5;241m1\u001B[39m:\n\u001B[1;32m 3892\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_getitem_multilevel(key)\n\u001B[0;32m-> 3893\u001B[0m indexer \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcolumns\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_loc\u001B[49m\u001B[43m(\u001B[49m\u001B[43mkey\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 3894\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m is_integer(indexer):\n\u001B[1;32m 3895\u001B[0m indexer \u001B[38;5;241m=\u001B[39m [indexer]\n",
"File \u001B[0;32m~/Projects/repos/group-1/db23-project-venv/lib/python3.11/site-packages/pandas/core/indexes/base.py:3797\u001B[0m, in \u001B[0;36mIndex.get_loc\u001B[0;34m(self, key)\u001B[0m\n\u001B[1;32m 3792\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(casted_key, \u001B[38;5;28mslice\u001B[39m) \u001B[38;5;129;01mor\u001B[39;00m (\n\u001B[1;32m 3793\u001B[0m \u001B[38;5;28misinstance\u001B[39m(casted_key, abc\u001B[38;5;241m.\u001B[39mIterable)\n\u001B[1;32m 3794\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m \u001B[38;5;28many\u001B[39m(\u001B[38;5;28misinstance\u001B[39m(x, \u001B[38;5;28mslice\u001B[39m) \u001B[38;5;28;01mfor\u001B[39;00m x \u001B[38;5;129;01min\u001B[39;00m casted_key)\n\u001B[1;32m 3795\u001B[0m ):\n\u001B[1;32m 3796\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m InvalidIndexError(key)\n\u001B[0;32m-> 3797\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m(key) \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01merr\u001B[39;00m\n\u001B[1;32m 3798\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mTypeError\u001B[39;00m:\n\u001B[1;32m 3799\u001B[0m \u001B[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001B[39;00m\n\u001B[1;32m 3800\u001B[0m \u001B[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001B[39;00m\n\u001B[1;32m 3801\u001B[0m \u001B[38;5;66;03m# the TypeError.\u001B[39;00m\n\u001B[1;32m 3802\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_check_indexing_error(key)\n",
"\u001B[0;31mKeyError\u001B[0m: 'Weekday_Name'"
]
}
],
"execution_count": null,
"outputs": [],
"source": [
"test_df['Weekday_Name']"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-12-02T23:48:10.103198Z",
"start_time": "2023-12-02T23:48:09.756006Z"
}
"collapsed": false
},
"id": "d0df3c0ef49e8061"
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"u_string RoadTrafficAccidentLocations.json\n",
"Filename: 2017_verkehrszaehlungen_werte_fussgaenger_velo.csv\n",
"Filename: RoadTrafficAccidentLocations.json\n",
"Filepath: datasets/RoadTrafficAccidentLocations.json\n",
"Filename: 2016_verkehrszaehlungen_werte_fussgaenger_velo.csv\n",
"Filename: 2022_verkehrszaehlungen_werte_fussgaenger_velo.csv\n",
"Filename: 2015_verkehrszaehlungen_werte_fussgaenger_velo.csv\n",
"Filename: 2019_verkehrszaehlungen_werte_fussgaenger_velo.csv\n",
"Filename: sid_dav_verkehrszaehlung_miv_OD2031_2013.csv\n",
"Filename: 2021_verkehrszaehlungen_werte_fussgaenger_velo.csv\n",
"Filename: sid_dav_verkehrszaehlung_miv_OD2031_2012.csv\n",
"Filename: sid_dav_verkehrszaehlung_miv_OD2031_2016.csv\n",
"Filename: 2014_verkehrszaehlungen_werte_fussgaenger_velo.csv\n",
"Filename: 2018_verkehrszaehlungen_werte_fussgaenger_velo.csv\n",
"Filename: sid_dav_verkehrszaehlung_miv_OD2031_2017.csv\n",
"Filename: sid_dav_verkehrszaehlung_miv_OD2031_2015.csv\n",
"Filename: 2020_verkehrszaehlungen_werte_fussgaenger_velo.csv\n",
"Filename: sid_dav_verkehrszaehlung_miv_OD2031_2014.csv\n",
"Filename: sid_dav_verkehrszaehlung_miv_OD2031_2019.csv\n",
"Filename: sid_dav_verkehrszaehlung_miv_OD2031_2018.csv\n",
"Filename: 2013_verkehrszaehlungen_werte_fussgaenger_velo.csv\n",
"Filename: sid_dav_verkehrszaehlung_miv_OD2031_2022.csv\n",
"Filename: sid_dav_verkehrszaehlung_miv_OD2031_2020.csv\n",
"Filename: 2012_verkehrszaehlungen_werte_fussgaenger_velo.csv\n",
"Filename: sid_dav_verkehrszaehlung_miv_OD2031_2021.csv\n"
]
}
],
"execution_count": null,
"outputs": [],
"source": [
"acc_df = intr.process_accident_data()"
"acc_df = intr.process_accident_data(True)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-12-03T11:15:51.051154Z",
"start_time": "2023-12-03T11:15:36.154717Z"
}
"is_executing": true
},
"id": "f86bc612060b17a4"
},
@ -175,15 +93,12 @@
"outputs": [],
"source": [
"acc_df.head()\n",
"acc_df['AccidentWeekDay'].unique()\n",
"acc_df['AccidentWeekDay_en'].unique()\n",
"#acc_df.dtypes\n",
"date_obj = dt.strptime(acc_df[''])\n"
"\n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"start_time": "2023-12-02T23:48:10.101387Z"
}
"collapsed": false
},
"id": "6affbeea6c7cf3ef"
},
@ -203,9 +118,7 @@
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"start_time": "2023-12-02T23:48:10.102789Z"
}
"is_executing": true
},
"id": "242041cd369d8454"
},
@ -213,12 +126,12 @@
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"source": [
"acc_df['ID'] = acc_df.index +1\n",
"acc_df[('ID')]"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"start_time": "2023-12-02T23:48:10.103954Z"
}
"collapsed": false
},
"id": "1841925ee109a417"
},
@ -232,60 +145,26 @@
"print(\"FB unique: \", fb_data['OST'].unique())\n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"start_time": "2023-12-02T23:48:10.104894Z"
}
"collapsed": false
},
"id": "f6d752ea17eda341"
},
{
"cell_type": "code",
"execution_count": 9,
"outputs": [
{
"data": {
"text/plain": " AccidentUID AccidentHour AccidentYear \\\n0 A2D2677533867004E0430A865E337004 00 2011 \n1 9FD6441F802C20A6E0430A865E3320A6 01 2011 \n2 9FDA0DC4856A6094E0430A865E336094 02 2011 \n3 A3B66E42396E6000E0430A865E336000 02 2011 \n4 9FDA0DBE8CCE9096E0430A865E339096 03 2011 \n\n AccidentWeekDay_en AccidentType AccidentSeverityCategory \\\n0 Saturday at0 as4 \n1 Saturday at0 as3 \n2 Saturday at0 as4 \n3 Saturday at5 as3 \n4 Saturday at0 as4 \n\n AccidentInvolvingPedestrian AccidentInvolvingBicycle \\\n0 false false \n1 false true \n2 false false \n3 false false \n4 false false \n\n AccidentInvolvingMotorcycle RoadType RoadType_en AccidentLocation_CHLV95_E \\\n0 false rt433 Minor road 2684605 \n1 false rt433 Minor road 2682382 \n2 false rt439 Other 2682791 \n3 false rt433 Minor road 2681199 \n4 false rt433 Minor road 2682479 \n\n AccidentLocation_CHLV95_N geometry \n0 1245194 POINT(8.55841025 47.3521677) \n1 1246980 POINT(8.52932024 47.36851152) \n2 1247749 POINT(8.5348767 47.37537618) \n3 1247102 POINT(8.51368203 47.36975554) \n4 1250690 POINT(8.53128819 47.40186473) ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>AccidentUID</th>\n <th>AccidentHour</th>\n <th>AccidentYear</th>\n <th>AccidentWeekDay_en</th>\n <th>AccidentType</th>\n <th>AccidentSeverityCategory</th>\n <th>AccidentInvolvingPedestrian</th>\n <th>AccidentInvolvingBicycle</th>\n <th>AccidentInvolvingMotorcycle</th>\n <th>RoadType</th>\n <th>RoadType_en</th>\n <th>AccidentLocation_CHLV95_E</th>\n <th>AccidentLocation_CHLV95_N</th>\n <th>geometry</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>A2D2677533867004E0430A865E337004</td>\n <td>00</td>\n <td>2011</td>\n <td>Saturday</td>\n <td>at0</td>\n <td>as4</td>\n <td>false</td>\n <td>false</td>\n <td>false</td>\n <td>rt433</td>\n <td>Minor road</td>\n <td>2684605</td>\n <td>1245194</td>\n <td>POINT(8.55841025 47.3521677)</td>\n </tr>\n <tr>\n <th>1</th>\n <td>9FD6441F802C20A6E0430A865E3320A6</td>\n <td>01</td>\n <td>2011</td>\n <td>Saturday</td>\n <td>at0</td>\n <td>as3</td>\n <td>false</td>\n <td>true</td>\n <td>false</td>\n <td>rt433</td>\n <td>Minor road</td>\n <td>2682382</td>\n <td>1246980</td>\n <td>POINT(8.52932024 47.36851152)</td>\n </tr>\n <tr>\n <th>2</th>\n <td>9FDA0DC4856A6094E0430A865E336094</td>\n <td>02</td>\n <td>2011</td>\n <td>Saturday</td>\n <td>at0</td>\n <td>as4</td>\n <td>false</td>\n <td>false</td>\n <td>false</td>\n <td>rt439</td>\n <td>Other</td>\n <td>2682791</td>\n <td>1247749</td>\n <td>POINT(8.5348767 47.37537618)</td>\n </tr>\n <tr>\n <th>3</th>\n <td>A3B66E42396E6000E0430A865E336000</td>\n <td>02</td>\n <td>2011</td>\n <td>Saturday</td>\n <td>at5</td>\n <td>as3</td>\n <td>false</td>\n <td>false</td>\n <td>false</td>\n <td>rt433</td>\n <td>Minor road</td>\n <td>2681199</td>\n <td>1247102</td>\n 
<td>POINT(8.51368203 47.36975554)</td>\n </tr>\n <tr>\n <th>4</th>\n <td>9FDA0DBE8CCE9096E0430A865E339096</td>\n <td>03</td>\n <td>2011</td>\n <td>Saturday</td>\n <td>at0</td>\n <td>as4</td>\n <td>false</td>\n <td>false</td>\n <td>false</td>\n <td>rt433</td>\n <td>Minor road</td>\n <td>2682479</td>\n <td>1250690</td>\n <td>POINT(8.53128819 47.40186473)</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"outputs": [],
"source": [
"acc_df.head()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-12-02T23:53:22.460557Z",
"start_time": "2023-12-02T23:53:22.453434Z"
}
"collapsed": false
},
"id": "a159cafa9c227b88"
},
{
"cell_type": "code",
"execution_count": 21,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/s3/8bc7ys2d24lgqhdlpttvp70r0000gn/T/ipykernel_59953/958527375.py:15: UserWarning: Geometry column does not contain geometry.\n",
" acc_df['geometry'] = acc_df['geometry'].apply(lambda geom: WKTElement(geom.wkt, srid=4326))\n"
]
},
{
"data": {
"text/plain": "826"
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"outputs": [],
"source": [
"from sqlalchemy import create_engine\n",
"from geoalchemy2 import Geometry, WKTElement\n",
@ -307,27 +186,19 @@
"\n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-12-03T00:00:35.257439Z",
"start_time": "2023-12-03T00:00:32.802219Z"
}
"collapsed": false
},
"id": "fa76af8343443d7a"
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": null,
"outputs": [],
"source": [
"engine.dispose()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-12-03T00:00:40.409019Z",
"start_time": "2023-12-03T00:00:40.406193Z"
}
"collapsed": false
},
"id": "bc0a23a5126e76c2"
}