Data Integration Milestone Completed.

Refactored the "ensure all relevant dirs exist" logic into a separate script.
Sebastian Lenzlinger 2023-12-03 23:53:36 +01:00
parent e9b1d82517
commit 49bb3f4e20
4 changed files with 79 additions and 38 deletions

README.md

@@ -1,15 +1,14 @@
-# Databases Project
-
-Use this repository for your integration code and any source code created while working on your project (ad-hoc code,
-SQL queries, project files needed by external tools, etc.).
-
-- Merge your code into the main branch on the due date.
-- Do not commit datasets!
-- Any other document (except for the dump in the final hand-in) should be handed in via ADAM.
-
-If you have any questions regarding the project, please do not hesitate to ask during the exercise lessons or via mail
-to [raphael.waltenspuel@unibas.ch](mailto:raphael.waltenspuel@unibas.ch)!
-
-It is recommended that you first create a ```.gitignore``` file. (And exclude the "datasets" folder, for example.) A useful tool for creating ```.gitignore``` files is www.gitignore.io.
-
-Feel free to update or replace this readme with a brief description of your project and goals.
+# Database Project Group 1
+
+## Preliminaries
+* Ensure you have access to a running Postgres instance.
+* Ensure you have ```python3``` and ```pip``` installed.
+* From within the root of the project, run ```pip install -r requirements.txt```. This ensures all Python dependencies are met.
+* In ```src/fill_db.py```, look for the ```db_info``` variable and adapt it to your credentials.
+
+## Action
+In the following, the order matters.
+1. Run ```ensure_dirs_exist.py```. This makes sure all the directories needed to perform the data integration and logging exist.
+2. Run ```integrate.py```. Adjust the main method to fit your needs. In particular, adjust the ```process_all_data()``` method so that the parameter corresponding to a dataset is ```False``` if the script shall download it from the internet, and ```True``` otherwise. To get the GeoJSON data for signaled speeds in the city of Zurich, uncomment the line in the ```main``` method containing ```load_tempo_geojson_from_api_to_local()``` (a sketch of the adjusted main guard follows below).
+3. Run ```fill_db.py```. This loads the data into the database using the credentials given in the ```db_info``` variable.
+4. Perform the analysis.
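Step 2 leaves the exact flag combination to the reader. Here is a minimal sketch of an adjusted main guard, using the function names that actually appear in the diffs below (```process_all_data_sources``` is the name in the code; the README's ```process_all_data()``` appears to be shorthand for it):

```python
if __name__ == '__main__':
    # True  -> the raw files are already present in the local file system
    # False -> the script downloads them from the internet
    process_all_data_sources(True, True, True)
    # Uncomment to fetch the signaled-speed GeoJSON for the city of Zurich:
    # load_tempo_geojson_from_api_to_local()
```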

src/ensure_dirs_exist.py (new file, +26 lines)

@@ -0,0 +1,26 @@
"""
The functionality of this script has been adapted from data_utils.ensure_dirs_exist().
This needs to be run before any other script.
"""
import logging
import os

data_dir = 'datasets/'
integrated_dir = 'datasets/integrated/'
logs_dir = 'logs/'

# basicConfig already attaches a stream handler with this format; adding a
# second StreamHandler by hand would print every message twice.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('ensure_dirs_exist.py')

logger.debug(f'data_dir: {data_dir}\n integrated_dir: {integrated_dir}\n logs_dir: {logs_dir}')
logger.info("Ensuring needed directories exist.")
os.makedirs(data_dir, exist_ok=True)
logger.debug("data_dir ensured.")
os.makedirs(integrated_dir, exist_ok=True)
logger.debug("integrated_dir ensured.")
os.makedirs(logs_dir, exist_ok=True)
logger.debug("logs_dir ensured.")
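Because every ```os.makedirs``` call passes ```exist_ok=True```, the script is idempotent and can safely be re-run before each pipeline invocation. A minimal illustration:

```python
import os

# exist_ok=True turns "directory already exists" from an error into a no-op.
os.makedirs('datasets/integrated/', exist_ok=True)
os.makedirs('datasets/integrated/', exist_ok=True)  # second call: no FileExistsError
```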

src/fill_db.py

@@ -12,9 +12,14 @@ logger.addHandler(stream_handler)
 integrated_dir = 'datasets/integrated/'
 accident_geojson_file = 'datasets/integrated/Accidents.geojson'
+signaled_speeds_file = 'datasets/integrated/signaled_speeds.geojson.geojson'
 accident_loader_script = 'load_accidents_into_db.sh'
 accident_table_name = 'accidents'
+signaled_speeds_table_name = 'signaled_speeds'
+"""
+Make sure db_info contains the correct credentials.
+"""
 db_info = {
     'host': 'localhost',
     'database': 'test-db23',
@@ -26,7 +31,6 @@ setup_tables_script = 'setup_tables.sql'
 load_csvs_into_db_script = 'load_csvs_into_db.sql'

 def run_sql(script, db_info):
     db_connection = psycopg2.connect(**db_info)
     db_cursor = db_connection.cursor()
@@ -47,7 +51,6 @@ def run_sql(script, db_info):

 def run_geojson_loader_script(script, *args):
     try:
         cmd = ['bash', script] + list(args)
         res = subprocess.run(cmd, check=True, text=True, capture_output=True)
@@ -57,7 +60,13 @@ def run_geojson_loader_script(script, *args):
     logger.info(f"Remember to set the correct permissions for the script: chmod +x {script}")

-def geojson_loader(*args):
+def geojson_loader(*args, modus='append'):
+    """
+    Use this instead of run_geojson_loader_script() in the main method to avoid the bash dependency.
+    :param args: all the arguments needed for ogr2ogr to run properly
+    :param modus: 'append' to add to the target DB table, 'overwrite' to replace it
+    :return:
+    """
     geojson_file, db_name, db_user, db_password, db_host, db_port, target_table = args
     cmd = [
         "ogr2ogr",
@@ -65,7 +74,7 @@ def geojson_loader(*args):
         f"PG:dbname='{db_name}' host='{db_host}' port='{db_port}' user='{db_user}' password='{db_password}'",
         geojson_file,
         "-nln", target_table,
-        "-append"
+        f"-{modus}"
     ]
     try:
         # Run the command
@@ -88,5 +97,12 @@ if __name__ == '__main__':
                                db_info['host'],
                                db_info['port'],
                                accident_table_name)
-    logger.info('Finished loading geojson into db using bash script.')
+    logger.info('Finished loading accident geojson into db using bash script.')
+    geojson_loader(signaled_speeds_file,
+                   db_info['database'],
+                   db_info['user'],
+                   db_info['password'],
+                   db_info['host'],
+                   db_info['port'],
+                   signaled_speeds_table_name,
+                   modus='overwrite')
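The diff only shows the first entries of ```db_info```, but the keys used elsewhere in the script (```db_info['user']```, ```db_info['password']```, ```db_info['port']```) imply the full shape below. Values are placeholders; the dictionary is passed straight to ```psycopg2.connect(**db_info)```:

```python
# Placeholder credentials -- adapt to your local Postgres instance.
db_info = {
    'host': 'localhost',
    'database': 'test-db23',
    'user': 'postgres',    # placeholder
    'password': 'secret',  # placeholder
    'port': '5432',        # placeholder (Postgres default)
}
```

Note that ```modus``` is interpolated directly into an ogr2ogr flag: ```'append'``` adds features to an existing table, while ```'overwrite'``` drops and recreates the target table before loading.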

src/integrate.py

@@ -79,23 +79,21 @@ acc_data_types = {
     'RoadType_en': 'str',
     'geometry': 'str'  # TODO: Figure out what dtype this needs to be for postgres
 }

-def ensure_dirs_exist(data_dir, integrated_dir):
-    """
-    This should be called before anything else to make sure that the relevant directories exist.
-    :param data_dir: directory where the datasets are stored
-    :param integrated_dir: directory where the integrated data will be stored
-    :return:
-    """
-    logger.debug(f'data_dir: {data_dir}\n integrated_dir: {integrated_dir}')
-    logger.info("Ensuring needed directories exist.")
-    os.makedirs(data_dir, exist_ok=True)
-    logger.debug("data_dir created.")
-    os.makedirs(integrated_dir, exist_ok=True)
-    logger.debug("integrated_dir created")
-    os.makedirs(logs_dir, exist_ok=True)
-    logger.debug("logs_dir created")
+# def ensure_dirs_exist(data_dir, integrated_dir, logs_dir):
+#     """
+#     This should be called before anything else to make sure that the relevant directories exist.
+#     :param data_dir: directory where the datasets are stored
+#     :param integrated_dir: directory where the integrated data will be stored
+#     :return:
+#     """
+#     logger.debug(f'data_dir: {data_dir}\n integrated_dir: {integrated_dir}')
+#     logger.info("Ensuring needed directories exist.")
+#     os.makedirs(data_dir, exist_ok=True)
+#     logger.debug("data_dir created.")
+#     os.makedirs(integrated_dir, exist_ok=True)
+#     logger.debug("integrated_dir created")
+#     os.makedirs(logs_dir, exist_ok=True)
+#     logger.debug("logs_dir created")

 def process_foot_bike_data(files_present=True):
@@ -180,7 +178,7 @@ def process_all_data_sources(fb_present=True, miv_present=True, accident_present
     :param accident_present: bool, whether the files are present in the local file system
     :return:
     """
-    ensure_dirs_exist(data_dir, integrated_dir)
+    # ensure_dirs_exist(data_dir, integrated_dir)
     logger.info("Started processing all data sources.")
     fb_to_integrated(fb_present)
@@ -240,6 +238,8 @@ def load_tempo_geojson_from_api_to_local():

 if __name__ == '__main__':
-    process_all_data_sources(True, True, True)
+    # ensure_dirs_exist(data_dir, integrated_dir, logs_dir)
+    # process_all_data_sources(True, True, True)
     # miv_to_integrated_csv()
     # acc_to_cleaned_geojson()
+    load_tempo_geojson_from_api_to_local()
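The body of ```load_tempo_geojson_from_api_to_local()``` is not part of this diff. A minimal sketch of what such a loader typically looks like, with the endpoint URL and output path as explicit assumptions (the real URL is not shown anywhere in this commit; the output path is inferred from the ```signaled_speeds_file``` constant in ```fill_db.py```):

```python
import urllib.request

# ASSUMPTION: placeholder URL -- the real Zurich open-data endpoint is not in this diff.
TEMPO_API_URL = 'https://example.org/zurich/signaled-speeds.geojson'
# ASSUMPTION: path inferred from the signaled_speeds_file constant in fill_db.py.
OUT_PATH = 'datasets/integrated/signaled_speeds.geojson.geojson'

def load_tempo_geojson_from_api_to_local():
    """Download the signaled-speed GeoJSON and store it locally for fill_db.py."""
    with urllib.request.urlopen(TEMPO_API_URL) as resp, open(OUT_PATH, 'wb') as out:
        out.write(resp.read())
```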