
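"""
Integrate the raw mobility data sources for the project.

Cleans the foot/bike counts, motorized traffic (MIV) counts and road traffic
accident locations and writes the results to datasets/integrated/ as
FootBikeCount.csv, MivCount.csv and Accidents.geojson; the signaled speeds
GeoJSON can additionally be downloaded from the city of Zurich's WFS API.
"""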
import data_utils as du
import os
import pandas as pd
import geopandas as gpd
import time
from shapely.geometry import Point
import re
import logging
logging.basicConfig(level=logging.DEBUG, filename='logs/integrate.log',
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('integrate.py')
stream_handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)
foot_bike_urls_file = '../docs/foot_bike_zaehlung_urls.txt'
miv_file_urls = '../docs/verkehrszaehlung_moto_urls.txt'
accident_file_url = '../docs/accident_loc_urls.txt'
# Using u_string to discriminate between files that belong to each other
motor_file_u_string = 'sid_dav_verkehrszaehlung_miv_OD2031'
foot_bike_file_u_string = 'velo.csv'
accident_file_u_string = 'RoadTrafficAccidentLocations.json'
data_dir = 'datasets/'
integrated_dir = 'datasets/integrated/'
logs_dir = 'logs/'
signaled_speeds_json_api = 'https://www.ogd.stadt-zuerich.ch/wfs/geoportal/Signalisierte_Geschwindigkeiten?service=WFS&version=1.1.0&request=GetFeature&outputFormat=GeoJSON&typename=view_geoserver_tempo_ist'
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
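# Expected column dtypes for each cleaned DataFrame; used with astype() so the
# DataFrames match the corresponding database tables.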
fb_data_types = {
'ID': 'int',
'NORD': 'int',
'OST': 'int',
'DATE': 'str',
'HRS': 'int',
'VELO_IN': 'int',
'VELO_OUT': 'int',
'FUSS_IN': 'int',
'FUSS_OUT': 'int',
'Weekday_en': 'str'
}
miv_data_types = {
'ID': 'int',
'MSID': 'str',
'ZSID': 'str',
'Achse': 'str',
'NKoord': 'int',
'EKoord': 'int',
'Richtung': 'str',
'AnzFahrzeuge': 'int',
'AnzFahrzeugeStatus': 'str',
'Datum': 'str',
'Hrs': 'int',
'Weekday_en': 'str',
}
acc_data_types = {
'AccidentUID': 'str',
'AccidentYear': 'int',
'AccidentMonth': 'int',
'AccidentWeekDay_en': 'str',
'AccidentHour': 'int',
'NKoord': 'int',
'EKoord': 'int',
'AccidentType_en': 'str',
'AccidentType': 'str',
'AccidentSeverityCategory': 'str',
'AccidentInvolvingPedestrian': 'bool',
'AccidentInvolvingBicycle': 'bool',
'AccidentInvolvingMotorcycle': 'bool',
'RoadType': 'str',
'RoadType_en': 'str',
'geometry': 'str' # TODO: Figure out what dtype this needs to be for postgres
}
# def ensure_dirs_exist(data_dir, integrated_dir, logs_dir):
# """
# This should be called before anything else to make sure that the relevant directories exist.
# :param data_dir: directory where the datasets are stored
# :param integrated_dir: directory where the integrated data will be stored
# :return:
# """
# logger.debug(f'data_dir: {data_dir}\n integrated_dir: {integrated_dir}')
# logger.info("Ensuring needed directories exist.")
# os.makedirs(data_dir, exist_ok=True)
# logger.debug("data_dir created.")
# os.makedirs(integrated_dir, exist_ok=True)
# logger.debug("integrated_dir created")
# os.makedirs(logs_dir, exist_ok=True)
# logger.debug("logs_dir created")
def process_foot_bike_data(files_present=True):
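    """
    Clean and aggregate the foot/bike count data.

    Builds a unified DataFrame from the raw files, splits the DATUM timestamp into
    DATE and HRS, sums the VELO_*/FUSS_* counts per location (OST/NORD), date and
    hour, adds an English weekday column and a running ID, and casts the columns
    to fb_data_types.

    :param files_present: bool, whether the raw files are already in the local file system
    :return: cleaned DataFrame matching the FootBikeCount schema
    """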
fb_df_unified = du.create_unified_df(foot_bike_urls_file, foot_bike_file_u_string, data_dir,
files_present=files_present)
fb_df_unified[['DATE', "TIME"]] = fb_df_unified['DATUM'].str.split('T', expand=True)
fb_df_unified[['HRS', 'MINS']] = fb_df_unified['TIME'].str.split(':', expand=True)
    ## We might need FK_ZAEHLER after all
fb_cols_to_drop = ['DATUM']
fb_df_unified_correct_cols = fb_df_unified.drop(columns=fb_cols_to_drop, axis=1)
fb_df_unified_correct_cols.fillna(0, inplace=True)
fb_df_grouped = fb_df_unified_correct_cols.groupby(['OST', 'NORD', 'DATE', 'HRS']).agg({
'VELO_IN': 'sum',
'VELO_OUT': 'sum',
'FUSS_IN': 'sum',
'FUSS_OUT': 'sum'
}).reset_index()
dt_obj = pd.to_datetime(fb_df_grouped['DATE'])
days = dt_obj.dt.weekday
fb_df_grouped['Weekday_en'] = days.map(lambda x: weekday_names[x])
cleaned_fb_df = fb_df_grouped
cleaned_fb_df['ID'] = cleaned_fb_df.index + 1
cleaned_fb_df = cleaned_fb_df[['ID', 'NORD', 'OST', 'DATE', 'HRS', 'VELO_IN', 'VELO_OUT', 'FUSS_IN',
'FUSS_OUT', 'Weekday_en']]
# Ensure datatype of df and sql table match
cleaned_fb_df = cleaned_fb_df.astype(fb_data_types)
return cleaned_fb_df
def process_miv_data(files_present=True):
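    """
    Clean the motorized traffic (MIV) count data.

    Builds a unified DataFrame from the raw files, splits MessungDatZeit into Datum
    and Hrs, fills missing AnzFahrzeuge and ZSID values, adds an English weekday
    column and a running ID, casts the columns to miv_data_types and drops
    duplicate rows.

    :param files_present: bool, whether the raw files are already in the local file system
    :return: cleaned DataFrame matching the MivCount schema
    """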
miv_df_unified = du.create_unified_df(miv_file_urls, motor_file_u_string, data_dir, files_present=files_present)
miv_df_unified[['Datum', "Time"]] = miv_df_unified['MessungDatZeit'].str.split('T', expand=True)
miv_df_unified[['Hrs', 'Mins', 'Sec']] = miv_df_unified['Time'].str.split(':', expand=True)
miv_cols_to_keep = ['MSID','ZSID','Achse', 'NKoord', 'EKoord', 'Richtung', 'AnzFahrzeuge', 'AnzFahrzeugeStatus',
'Datum', 'Hrs',]
miv_df_cols_dropped = miv_df_unified[miv_cols_to_keep]
dt_obj = pd.to_datetime(miv_df_cols_dropped['Datum'])
days = dt_obj.dt.weekday
miv_df_cols_dropped.loc[:, 'Weekday_en'] = days.map(lambda x: weekday_names[x])
miv_df_cols_dropped.loc[:, 'AnzFahrzeuge'] = miv_df_cols_dropped['AnzFahrzeuge'].fillna(0).astype(int)
    miv_df_cols_dropped.loc[:, 'ZSID'] = miv_df_cols_dropped['ZSID'].fillna('Missing').astype(str)
miv_df_cols_dropped['ID'] = (miv_df_cols_dropped.index + 1).copy()
cleaned_miv_df = miv_df_cols_dropped[['ID', 'MSID', 'ZSID', 'Achse', 'NKoord', 'EKoord', 'Richtung', 'AnzFahrzeuge',
'AnzFahrzeugeStatus', 'Datum', 'Hrs', 'Weekday_en']]
cleaned_miv_df = cleaned_miv_df.astype(miv_data_types)
cleaned_miv_df = cleaned_miv_df.drop_duplicates()
return cleaned_miv_df
def process_accident_data(file_present: bool = True):
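    """
    Clean the road traffic accident location data.

    Downloads the GeoJSON file first if it is not present, keeps the relevant
    columns, renames the CHLV95 coordinate columns to NKoord/EKoord and casts the
    columns to acc_data_types.

    :param file_present: bool, whether the GeoJSON file is already in the local file system
    :return: cleaned DataFrame matching the Accidents schema
    """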
if not file_present:
du.process_urls(data_dir, accident_file_url)
acc_df_unified = du.load_dataframes_from_geojson_files(data_dir, accident_file_u_string)
acc_cols_to_keep = ['AccidentUID', 'AccidentYear', 'AccidentMonth', 'AccidentWeekDay_en','AccidentHour',
'AccidentLocation_CHLV95_N', 'AccidentLocation_CHLV95_E', 'AccidentType_en', 'AccidentType',
'AccidentSeverityCategory', 'AccidentInvolvingPedestrian', 'AccidentInvolvingBicycle',
'AccidentInvolvingMotorcycle', 'RoadType', 'RoadType_en',
'geometry']
cleaned_acc_df = acc_df_unified[acc_cols_to_keep]
    cleaned_acc_df = cleaned_acc_df.rename(columns={
        'AccidentLocation_CHLV95_E': 'EKoord',
        'AccidentLocation_CHLV95_N': 'NKoord',
    })
cleaned_acc_df = cleaned_acc_df.astype(acc_data_types)
return cleaned_acc_df
def process_all_data_sources(fb_present=True, miv_present=True, accident_present=True):
"""
    Process all data sources and turn them into integrated files. After this function
    is called, the cleaned and integrated data sources exist as CSV/GeoJSON files.
    :param fb_present: bool, whether the foot/bike count files are present in the local file system
    :param miv_present: bool, whether the motorized traffic count files are present in the local file system
    :param accident_present: bool, whether the accident file is present in the local file system
    :return:
"""
# ensure_dirs_exist(data_dir, integrated_dir)
logger.info("Started processing all data sources.")
fb_to_integrated(fb_present)
miv_to_integrated_csv(miv_present)
acc_to_cleaned_geojson(accident_present)
def fb_to_integrated(files_present=True):
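    """Process the foot/bike count data and write it to FootBikeCount.csv in the integrated directory."""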
start_time = time.time()
logger.info("Start processing pedestrian and bicycle data (FootBikeCount)")
fb_count_df = process_foot_bike_data(files_present)
logger.debug(f'FB Head:{fb_count_df.head()}\n FB dtypes: {fb_count_df.dtypes}')
fb_file_path = os.path.join(integrated_dir, 'FootBikeCount.csv')
logger.debug(f'FB Cleaned File Path: {fb_file_path}')
fb_count_df.to_csv(fb_file_path, index=False)
logger.info("FB integrated csv created.")
end_time = time.time()
logger.info(f'Time taken for FootBikeCount: {end_time-start_time}')
def miv_to_integrated_csv(miv_present=True):
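    """Process the motorized traffic count data and write it to MivCount.csv in the integrated directory."""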
start_time2 = time.time()
logger.info("Start processing motorized vehicle data (MivCount)")
miv_count_df = process_miv_data(miv_present)
logger.debug(f'MIV Head:{miv_count_df.head()}\n MIV dtypes: {miv_count_df.dtypes}')
miv_file_path = os.path.join(integrated_dir, 'MivCount.csv')
logger.debug(f'MIV Cleaned File Path: {miv_file_path}')
miv_count_df.to_csv(miv_file_path, index=False)
logger.info("MIV integrated csv created.")
end_time = time.time()
logger.info(f'Time taken for MivCount: {end_time-start_time2}')
def acc_to_cleaned_geojson(acc_present=True):
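    """Parse the accident geometries into shapely Points and write them to Accidents.geojson in the integrated directory."""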
start_time3 = time.time()
logger.info("Start processing accident data (Accidents)")
acc_df = process_accident_data(acc_present)
logger.debug(f'ACC Head: { acc_df.head()}\n Acc dtypes: {acc_df.dtypes}')
acc_file_path = os.path.join(integrated_dir, 'Accidents.geojson')
logger.debug(f'Acc Cleaned file path: {acc_file_path}')
acc_df['geometry'] = acc_df['geometry'].apply(lambda row: re.findall(r"[-+]?\d*\.\d+|\d+", row))
# Create a Point object using the extracted coordinates
acc_df['geometry'] = acc_df['geometry'].apply(
lambda coords: Point(float(coords[0]), float(coords[1]), float(coords[2])))
acc_gdf = gpd.GeoDataFrame(acc_df, geometry='geometry')
acc_gdf.to_file(acc_file_path, driver='GeoJSON')
    logger.info("ACC integrated GeoJSON created.")
end_time = time.time()
logger.info(f'Time taken for Accidents: {end_time - start_time3}')
def load_tempo_geojson_from_api_to_local():
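    """Download the signaled speeds GeoJSON from the city's WFS API into the integrated directory."""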
du.load_file_from_api(signaled_speeds_json_api, 'signaled_speeds.geojson', integrated_dir)
if __name__ == '__main__':
# ensure_dirs_exist(data_dir, integrated_dir, logs_dir)
# process_all_data_sources(True, True, True)
# miv_to_integrated_csv()
# acc_to_cleaned_geojson()
load_tempo_geojson_from_api_to_local()