Drop duplicate rows from the unified MIV CSV.

This commit is contained in:
Sebastian Lenzlinger 2023-12-03 17:05:09 +01:00
parent c33ca87aaf
commit 920f725d74
3 changed files with 54 additions and 190 deletions

View File

@ -125,7 +125,7 @@ def process_miv_data(files_present=True):
miv_cols_to_keep = ['MSID','ZSID','Achse', 'EKoord', 'NKoord', 'Richtung', 'AnzFahrzeuge', 'AnzFahrzeugeStatus', miv_cols_to_keep = ['MSID','ZSID','Achse', 'EKoord', 'NKoord', 'Richtung', 'AnzFahrzeuge', 'AnzFahrzeugeStatus',
'Datum', 'Hrs'] 'Datum', 'Hrs']
miv_df_cols_dropped = miv_df_unified[miv_cols_to_keep] miv_df_cols_dropped = miv_df_unified#[miv_cols_to_keep]
dt_obj = pd.to_datetime(miv_df_cols_dropped['Datum']) dt_obj = pd.to_datetime(miv_df_cols_dropped['Datum'])
days = dt_obj.dt.weekday days = dt_obj.dt.weekday
@ -136,6 +136,7 @@ def process_miv_data(files_present=True):
'AnzFahrzeugeStatus', 'Datum', 'Hrs', 'Weekday_en']] 'AnzFahrzeugeStatus', 'Datum', 'Hrs', 'Weekday_en']]
cleaned_miv_df = cleaned_miv_df.astype(miv_data_types) cleaned_miv_df = cleaned_miv_df.astype(miv_data_types)
cleaned_miv_df = cleaned_miv_df.drop_duplicates()
return cleaned_miv_df return cleaned_miv_df
@ -222,3 +223,7 @@ def miv_to_integrated_csv(miv_present=True):
if __name__ == '__main__': if __name__ == '__main__':
#process_all_data_sources(True, True, True) #process_all_data_sources(True, True, True)
miv_to_integrated_csv() miv_to_integrated_csv()
# path = os.path.join(integrated_dir, 'MivCount.csv')
# df = pd.read_csv(path)
# duplicate_rows = df[df.duplicated()]
# print(duplicate_rows.shape[0])

View File

@ -35,7 +35,7 @@ CREATE TABLE MivCount (
Datum VARCHAR(10) , Datum VARCHAR(10) ,
Hrs Integer , Hrs Integer ,
Weekday_en VARCHAR(10), Weekday_en VARCHAR(10),
PRIMARY KEY (MSID), PRIMARY KEY (MSID, Achse,Richtung, Datum, Hrs),
CHECK (Weekday_en IN ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')), CHECK (Weekday_en IN ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')),
CHECK (Hrs BETWEEN 0 AND 23) CHECK (Hrs BETWEEN 0 AND 23)
); );
@ -44,7 +44,7 @@ CREATE TABLE MivCount (
DROP TABLE IF EXISTS Accidents; DROP TABLE IF EXISTS Accidents;
CREATE TABLE Accidents ( CREATE TABLE Accidents (
AccidentUID VARCHAR(32) , AccidentUID VARCHAR(256) ,
AccidentYear INTEGER , AccidentYear INTEGER ,
AccidentMonth INTEGER, AccidentMonth INTEGER,
AccidentWeekDay_en VARCHAR(10) , AccidentWeekDay_en VARCHAR(10) ,
@ -67,5 +67,9 @@ CREATE TABLE Accidents (
); );
COPY FootBikeCount FROM '/Users/seb/Projects/repos/group-1/src/datasets/integrated/FootBikeCount.csv' COPY FootBikeCount FROM '/Users/seb/Projects/repos/group-1/src/datasets/integrated/FootBikeCount.csv'
DELIMITER ','
CSV HEADER;
COPY MivCount FROM '/Users/seb/Projects/repos/group-1/src/datasets/integrated/MivCount.csv'
DELIMITER ',' DELIMITER ','
CSV HEADER; CSV HEADER;

View File

@ -8,13 +8,14 @@
"import pandas as pd\n", "import pandas as pd\n",
"from datetime import datetime as dt\n", "from datetime import datetime as dt\n",
"\n", "\n",
"import integrate as intr\n" "import integrate as intr\n",
"\n"
], ],
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"ExecuteTime": { "ExecuteTime": {
"end_time": "2023-12-03T12:17:02.386525Z", "end_time": "2023-12-03T15:47:10.110909Z",
"start_time": "2023-12-03T12:17:01.722469Z" "start_time": "2023-12-03T15:47:09.656556Z"
} }
}, },
"id": "be55b25929d95559" "id": "be55b25929d95559"
@ -27,38 +28,57 @@
"name": "stderr", "name": "stderr",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"/Users/seb/Projects/repos/group-1/src/integrate.py:62: SettingWithCopyWarning: \n", "/Users/seb/Projects/repos/group-1/src/integrate.py:132: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n", "A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n", "Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n", "\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" miv_df_cols_dropped['Weekday_en'] = days.map(lambda x: weekday_names[x])\n" " miv_df_cols_dropped['Weekday_en'] = days.map(lambda x: weekday_names[x])\n",
"/Users/seb/Projects/repos/group-1/src/integrate.py:133: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" miv_df_cols_dropped['AnzFahrzeuge'] = miv_df_cols_dropped['AnzFahrzeuge'].fillna(0).astype(int)\n"
] ]
} }
], ],
"source": [ "source": [
"\n", "\n",
"miv_df = intr.process_miv_data()\n", "miv_df = intr.process_miv_data()\n",
"fb_data = intr.process_foot_bike_data()" "#fb_data = intr.process_foot_bike_data()"
], ],
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"ExecuteTime": { "ExecuteTime": {
"end_time": "2023-12-03T12:20:31.968179Z", "end_time": "2023-12-03T15:49:07.561603Z",
"start_time": "2023-12-03T12:17:04.199209Z" "start_time": "2023-12-03T15:47:14.759104Z"
} }
}, },
"id": "dd3831953afdeb72" "id": "dd3831953afdeb72"
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 6,
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"121\n"
]
}
],
"source": [ "source": [
"test_df = miv_df\n" "duplicate_rows = miv_df[miv_df.duplicated()]\n",
"print(duplicate_rows.shape[0])"
], ],
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"ExecuteTime": {
"end_time": "2023-12-03T15:51:21.158909Z",
"start_time": "2023-12-03T15:51:15.711222Z"
}
}, },
"id": "14471cd78389ce4d" "id": "14471cd78389ce4d"
}, },
@ -66,60 +86,11 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"outputs": [], "outputs": [],
"source": [
"test_df.dtypes\n",
"date_object = pd.to_datetime(test_df['Date'])\n"
],
"metadata": {
"collapsed": false
},
"id": "c70d21adef38fd68"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"test_df['Weekday_Name']"
],
"metadata": {
"collapsed": false
},
"id": "d0df3c0ef49e8061"
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"u_string RoadTrafficAccidentLocations.json\n",
"Filepath: datasets/RoadTrafficAccidentLocations.json\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/seb/Projects/repos/group-1/src/integrate.py:78: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" cleaned_acc_df.rename(columns={\n"
]
}
],
"source": [ "source": [
"acc_df = intr.process_accident_data(True)" "acc_df = intr.process_accident_data(True)"
], ],
"metadata": { "metadata": {
"collapsed": false, "collapsed": false
"ExecuteTime": {
"end_time": "2023-12-03T12:20:47.066579Z",
"start_time": "2023-12-03T12:20:31.964275Z"
}
}, },
"id": "f86bc612060b17a4" "id": "f86bc612060b17a4"
}, },
@ -140,59 +111,8 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": null,
"outputs": [ "outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accident Columns:\n",
"AccidentUID object\n",
"AccidentHour object\n",
"AccidentYear object\n",
"AccidentWeekDay_en object\n",
"AccidentType object\n",
"AccidentSeverityCategory object\n",
"AccidentInvolvingPedestrian object\n",
"AccidentInvolvingBicycle object\n",
"AccidentInvolvingMotorcycle object\n",
"RoadType object\n",
"RoadType_en object\n",
"EKoord object\n",
"NKoord object\n",
"AccidentMonth object\n",
"geometry geometry\n",
"dtype: object\n",
"\n",
"MIV Columns:\n",
"MSID object\n",
"ZSID object\n",
"Achse object\n",
"EKoord float64\n",
"NKoord float64\n",
"Richtung object\n",
"AnzFahrzeuge float64\n",
"AnzFahrzeugeStatus object\n",
"Date object\n",
"Hrs object\n",
"Weekday_en object\n",
"dtype: object\n",
"\n",
"FB Cols:\n",
"OST int64\n",
"NORD int64\n",
"DATE object\n",
"HRS object\n",
"VELO_IN float64\n",
"VELO_OUT float64\n",
"FUSS_IN float64\n",
"FUSS_OUT float64\n",
"Weekday_en object\n",
"ID int64\n",
"dtype: object\n"
]
}
],
"source": [ "source": [
"print(\"Accident Columns:\")\n", "print(\"Accident Columns:\")\n",
"print(acc_df.dtypes)\n", "print(acc_df.dtypes)\n",
@ -204,11 +124,7 @@
"print(fb_data.dtypes)" "print(fb_data.dtypes)"
], ],
"metadata": { "metadata": {
"collapsed": false, "collapsed": false
"ExecuteTime": {
"end_time": "2023-12-03T12:20:47.067419Z",
"start_time": "2023-12-03T12:20:47.063397Z"
}
}, },
"id": "242041cd369d8454" "id": "242041cd369d8454"
}, },
@ -227,88 +143,27 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": null,
"outputs": [ "outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"MIV unqiue: 0 2683009.89\n",
"1 2683009.89\n",
"2 2683009.89\n",
"3 2683009.89\n",
"4 2683009.89\n",
" ... \n",
"16699185 2682704.50\n",
"16699186 2682704.50\n",
"16699187 2682704.50\n",
"16699188 2682704.50\n",
"16699189 2682704.50\n",
"Name: EKoord, Length: 16699190, dtype: float64\n",
"Acc unique: <bound method Series.unique of 0 rt433\n",
"1 rt433\n",
"2 rt439\n",
"3 rt433\n",
"4 rt433\n",
" ... \n",
"55821 rt432\n",
"55822 rt433\n",
"55823 rt433\n",
"55824 rt433\n",
"55825 rt432\n",
"Name: RoadType, Length: 55826, dtype: object>\n",
"FB unique: 0 2012-01-01\n",
"1 2012-01-01\n",
"2 2012-01-01\n",
"3 2012-01-01\n",
"4 2012-01-01\n",
" ... \n",
"3011488 2019-07-13\n",
"3011489 2019-07-13\n",
"3011490 2019-07-13\n",
"3011491 2019-07-13\n",
"3011492 2019-07-13\n",
"Name: DATE, Length: 3011493, dtype: object\n"
]
}
],
"source": [ "source": [
"print(\"MIV unqiue:\", miv_df['EKoord'])\n", "print(\"MIV unqiue:\", miv_df['EKoord'])\n",
"print(\"Acc unique:\", acc_df['RoadType'].unique)\n", "print(\"Acc unique:\", acc_df['RoadType'].unique)\n",
"print(\"FB unique: \", fb_data['DATE'])\n" "print(\"FB unique: \", fb_data['DATE'])\n"
], ],
"metadata": { "metadata": {
"collapsed": false, "collapsed": false
"ExecuteTime": {
"end_time": "2023-12-03T15:03:13.580284Z",
"start_time": "2023-12-03T15:03:13.574959Z"
}
}, },
"id": "f6d752ea17eda341" "id": "f6d752ea17eda341"
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": null,
"outputs": [ "outputs": [],
{
"data": {
"text/plain": " AccidentUID AccidentHour AccidentYear \\\n0 A2D2677533867004E0430A865E337004 00 2011 \n1 9FD6441F802C20A6E0430A865E3320A6 01 2011 \n2 9FDA0DC4856A6094E0430A865E336094 02 2011 \n3 A3B66E42396E6000E0430A865E336000 02 2011 \n4 9FDA0DBE8CCE9096E0430A865E339096 03 2011 \n\n AccidentWeekDay_en AccidentType AccidentSeverityCategory \\\n0 Saturday at0 as4 \n1 Saturday at0 as3 \n2 Saturday at0 as4 \n3 Saturday at5 as3 \n4 Saturday at0 as4 \n\n AccidentInvolvingPedestrian AccidentInvolvingBicycle \\\n0 false false \n1 false true \n2 false false \n3 false false \n4 false false \n\n AccidentInvolvingMotorcycle RoadType RoadType_en EKoord NKoord \\\n0 false rt433 Minor road 2684605 1245194 \n1 false rt433 Minor road 2682382 1246980 \n2 false rt439 Other 2682791 1247749 \n3 false rt433 Minor road 2681199 1247102 \n4 false rt433 Minor road 2682479 1250690 \n\n AccidentMonth geometry \n0 1 POINT Z (8.55841 47.35217 0.00000) \n1 1 POINT Z (8.52932 47.36851 0.00000) \n2 1 POINT Z (8.53488 47.37538 0.00000) \n3 1 POINT Z (8.51368 47.36976 0.00000) \n4 1 POINT Z (8.53129 47.40186 0.00000) ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>AccidentUID</th>\n <th>AccidentHour</th>\n <th>AccidentYear</th>\n <th>AccidentWeekDay_en</th>\n <th>AccidentType</th>\n <th>AccidentSeverityCategory</th>\n <th>AccidentInvolvingPedestrian</th>\n <th>AccidentInvolvingBicycle</th>\n <th>AccidentInvolvingMotorcycle</th>\n <th>RoadType</th>\n <th>RoadType_en</th>\n <th>EKoord</th>\n <th>NKoord</th>\n <th>AccidentMonth</th>\n <th>geometry</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>A2D2677533867004E0430A865E337004</td>\n <td>00</td>\n <td>2011</td>\n <td>Saturday</td>\n <td>at0</td>\n <td>as4</td>\n <td>false</td>\n <td>false</td>\n <td>false</td>\n <td>rt433</td>\n <td>Minor road</td>\n <td>2684605</td>\n <td>1245194</td>\n <td>1</td>\n <td>POINT Z (8.55841 47.35217 0.00000)</td>\n </tr>\n <tr>\n <th>1</th>\n <td>9FD6441F802C20A6E0430A865E3320A6</td>\n <td>01</td>\n <td>2011</td>\n <td>Saturday</td>\n <td>at0</td>\n <td>as3</td>\n <td>false</td>\n <td>true</td>\n <td>false</td>\n <td>rt433</td>\n <td>Minor road</td>\n <td>2682382</td>\n <td>1246980</td>\n <td>1</td>\n <td>POINT Z (8.52932 47.36851 0.00000)</td>\n </tr>\n <tr>\n <th>2</th>\n <td>9FDA0DC4856A6094E0430A865E336094</td>\n <td>02</td>\n <td>2011</td>\n <td>Saturday</td>\n <td>at0</td>\n <td>as4</td>\n <td>false</td>\n <td>false</td>\n <td>false</td>\n <td>rt439</td>\n <td>Other</td>\n <td>2682791</td>\n <td>1247749</td>\n <td>1</td>\n <td>POINT Z (8.53488 47.37538 0.00000)</td>\n </tr>\n <tr>\n <th>3</th>\n <td>A3B66E42396E6000E0430A865E336000</td>\n <td>02</td>\n <td>2011</td>\n <td>Saturday</td>\n <td>at5</td>\n <td>as3</td>\n <td>false</td>\n <td>false</td>\n <td>false</td>\n <td>rt433</td>\n <td>Minor road</td>\n 
<td>2681199</td>\n <td>1247102</td>\n <td>1</td>\n <td>POINT Z (8.51368 47.36976 0.00000)</td>\n </tr>\n <tr>\n <th>4</th>\n <td>9FDA0DBE8CCE9096E0430A865E339096</td>\n <td>03</td>\n <td>2011</td>\n <td>Saturday</td>\n <td>at0</td>\n <td>as4</td>\n <td>false</td>\n <td>false</td>\n <td>false</td>\n <td>rt433</td>\n <td>Minor road</td>\n <td>2682479</td>\n <td>1250690</td>\n <td>1</td>\n <td>POINT Z (8.53129 47.40186 0.00000)</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"acc_df.head()" "acc_df.head()"
], ],
"metadata": { "metadata": {
"collapsed": false, "collapsed": false
"ExecuteTime": {
"end_time": "2023-12-03T12:32:55.249260Z",
"start_time": "2023-12-03T12:32:55.235008Z"
}
}, },
"id": "a159cafa9c227b88" "id": "a159cafa9c227b88"
}, },