diff --git a/README.md b/README.md
index 9c9703d..bae52c9 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,14 @@
-# Databases Project
+# Database Project Group 1
+
-Use this repository for your integration code and any source code created while working on your project (ad-hoc code,
-SQL queries, project files needed by external tools, etc.).
+## Preliminaries
+* Ensure you have access to a running postgres instance.
+* Ensure you have ```python3``` and ```pip``` installed.
+* From within the root of the project, run ```pip install -r requirements.txt```. This ensures all Python dependencies are met.
+* In ```src/fill_db.py```, look for the ```db_info``` variable and adapt it to your credentials.
-- Merge your code into the main branch on the due date.
-- Do not commit datasets!
-- Any other document (except for the dump in the final hand-in) should be handed-in via ADAM.
-
-If you have any questions regarding the project, please do not hesitate to ask during the exercise lessons or via mail
-to [raphael.waltenspuel@unibas.ch](mailto:raphael.waltenspuel@unibas.ch)!
-
-It is recommended that you first create a ```.gitignore``` file. (And exclude the "datasets" folder, for example). A useful tool for creating ```.gitignore``` files is www.gitignore.io.
-
-Feel free to update or replace this readme with a brief description of your project and goals.
\ No newline at end of file
+
+## Action
+The order of the following steps matters.
+1. Run ```ensure_dirs_exist.py```. This makes sure all the directories needed for the data integration and logging exist.
+2. Run ```integrate.py```. Adjust the main method to fit your needs. In particular, adjust the call to ```process_all_data_sources()``` such that the parameter corresponding to a dataset is ```False``` if the script should download it from the internet, and ```True``` if the files are already present locally. To get the geojson data for signaled speeds in the city of Zurich, uncomment the line in the ```main``` method that calls ```load_tempo_geojson_from_api_to_local()```.
+3. Run ```fill_db.py```. This will load the data into the database, using the credentials given in the ```db_info``` variable.
+4. Perform the analysis.
\ No newline at end of file
diff --git a/src/ensure_dirs_exist.py b/src/ensure_dirs_exist.py
new file mode 100644
index 0000000..2ac3d57
--- /dev/null
+++ b/src/ensure_dirs_exist.py
@@ -0,0 +1,29 @@
+import logging
+import os
+"""
+The functionality of this script has been adapted from data_utils.ensure_dirs_exist().
+This needs to be run before any other script.
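+A minimal usage sketch (assuming the script is run from the project root, since
+the paths below are relative):
+    python3 src/ensure_dirs_exist.py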
+""" +data_dir = 'datasets/' +integrated_dir = 'datasets/integrated/' +logs_dir = 'logs/' + +logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger('integrate.py') +stream_handler = logging.StreamHandler() +formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') +stream_handler.setFormatter(formatter) +logger.addHandler(stream_handler) + +logger.debug(f'data_dir: {data_dir}\n integrated_dir: {integrated_dir}') +logger.info("Ensuring needed directories exist.") +os.makedirs(data_dir, exist_ok=True) +logger.debug("data_dir created.") +os.makedirs(integrated_dir, exist_ok=True) +logger.debug("integrated_dir created") +os.makedirs(logs_dir, exist_ok=True) +logger.debug("logs_dir created") diff --git a/src/fill_db.py b/src/fill_db.py index 041dad8..003b451 100644 --- a/src/fill_db.py +++ b/src/fill_db.py @@ -12,9 +12,14 @@ logger.addHandler(stream_handler) integrated_dir = 'datasets/integrated/' accident_geojson_file = 'datasets/integrated/Accidents.geojson' +signaled_speeds_file = 'datasets/integrated/signaled_speeds.geojson.geojson' accident_loader_script = 'load_accidents_into_db.sh' accident_table_name = 'accidents' +signaled_speeds_table_name = 'signaled_speeds' +""" +Make sure db_info contain the correct credentials +""" db_info = { 'host': 'localhost', 'database': 'test-db23', @@ -26,7 +31,6 @@ setup_tables_script = 'setup_tables.sql' load_csvs_into_db_script = 'load_csvs_into_db.sql' - def run_sql(script, db_info): db_connection = psycopg2.connect(**db_info) db_cursor = db_connection.cursor() @@ -47,7 +51,6 @@ def run_sql(script, db_info): def run_geojson_loader_script(script, *args): - try: cmd = ['bash', script] + list(args) res = subprocess.run(cmd, check=True, text=True, capture_output=True) @@ -57,7 +60,13 @@ def run_geojson_loader_script(script, *args): logger.info(f"Remember to set the correct permissions for the script: chmod +x {script}") -def geojson_loader(*args): +def geojson_loader(*args, modus='append'): + """ + Use this instead of run_geojson_loader_script() in the main method to avoid the bash dependency. + :param args: All the arguments needed for ogr2org to run properly + :param modus: append or overwrite db table + :return: + """ geojson_file, db_name, db_user, db_password, db_host, db_port, target_table = args cmd = [ "ogr2ogr", @@ -65,7 +74,7 @@ def geojson_loader(*args): f"PG:dbname='{db_name}' host='{db_host}' port='{db_port}' user='{db_user}' password='{db_password}'", geojson_file, "-nln", target_table, - "-append" + f"-{modus}" ] try: # Run the command @@ -88,5 +97,12 @@ if __name__ == '__main__': db_info['host'], db_info['port'], accident_table_name) - logger.info('Finished loading geojson into db using bash script.') - + logger.info('Finished loading accident geojson into db using bash script.') + geojson_loader(signaled_speeds_file, + db_info['database'], + db_info['user'], + db_info['password'], + db_info['host'], + db_info['port'], + signaled_speeds_table_name, + modus='overwrite') diff --git a/src/integrate.py b/src/integrate.py index 31c2d02..d37d174 100644 --- a/src/integrate.py +++ b/src/integrate.py @@ -79,23 +79,21 @@ acc_data_types = { 'RoadType_en': 'str', 'geometry': 'str' # TODO: Figure out what dtype this needs to be for postgres } - - -def ensure_dirs_exist(data_dir, integrated_dir): - """ - This should be called before anything else to make sure that the relevant directories exists. 
diff --git a/src/integrate.py b/src/integrate.py
index 31c2d02..d37d174 100644
--- a/src/integrate.py
+++ b/src/integrate.py
@@ -79,23 +79,21 @@ acc_data_types = {
     'RoadType_en': 'str',
     'geometry': 'str'  # TODO: Figure out what dtype this needs to be for postgres
 }
-
-
-def ensure_dirs_exist(data_dir, integrated_dir):
-    """
-    This should be called before anything else to make sure that the relevant directories exists.
-    :param data_dir: directory where the datasets are stored
-    :param integrated_dir: directory where the integrated data will be stored
-    :return:
-    """
-    logger.debug(f'data_dir: {data_dir}\n integrated_dir: {integrated_dir}')
-    logger.info("Ensuring needed directories exist.")
-    os.makedirs(data_dir, exist_ok=True)
-    logger.debug("data_dir created.")
-    os.makedirs(integrated_dir, exist_ok=True)
-    logger.debug("integrated_dir created")
-    os.makedirs(logs_dir, exist_ok=True)
-    logger.debug("logs_dir created")
+# def ensure_dirs_exist(data_dir, integrated_dir, logs_dir):
+#     """
+#     This should be called before anything else to make sure that the relevant directories exist.
+#     :param data_dir: directory where the datasets are stored
+#     :param integrated_dir: directory where the integrated data will be stored
+#     :return:
+#     """
+#     logger.debug(f'data_dir: {data_dir}\n integrated_dir: {integrated_dir}')
+#     logger.info("Ensuring needed directories exist.")
+#     os.makedirs(data_dir, exist_ok=True)
+#     logger.debug("data_dir created.")
+#     os.makedirs(integrated_dir, exist_ok=True)
+#     logger.debug("integrated_dir created")
+#     os.makedirs(logs_dir, exist_ok=True)
+#     logger.debug("logs_dir created")
 
 
 def process_foot_bike_data(files_present=True):
@@ -180,7 +178,7 @@ def process_all_data_sources(fb_present=True, miv_present=True, accident_present
     :param accident_present: bool, if the files present in local file system
     :return:
     """
-    ensure_dirs_exist(data_dir, integrated_dir)
+    # ensure_dirs_exist(data_dir, integrated_dir)
 
     logger.info("Started processing all data sources.")
     fb_to_integrated(fb_present)
@@ -240,6 +238,11 @@ def load_tempo_geojson_from_api_to_local():
 
 
 if __name__ == '__main__':
-    process_all_data_sources(True, True, True)
+    # ensure_dirs_exist(data_dir, integrated_dir, logs_dir)
+    # process_all_data_sources(True, True, True)
     # miv_to_integrated_csv()
     # acc_to_cleaned_geojson()
+    load_tempo_geojson_from_api_to_local()
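+    # Note: each boolean passed to process_all_data_sources() says whether that
+    # dataset is already present locally (True) or should be downloaded first
+    # (False); see the README for the full pipeline order.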