Data Integration Milestone Completed.

Refactored the code that ensures all relevant dirs exist into a separate script.
This commit is contained in:
parent e9b1d82517
commit 49bb3f4e20

README.md (25 changed lines)
```diff
@@ -1,15 +1,14 @@
-# Databases Project
-
-Use this repository for your integration code and any source code created while working on your project (ad-hoc code,
-SQL queries, project files needed by external tools, etc.).
-
-- Merge your code into the main branch on the due date.
-- Do not commit datasets!
-- Any other document (except for the dump in the final hand-in) should be handed-in via ADAM.
-
-If you have any questions regarding the project, please do not hesitate to ask during the exercise lessons or via mail
-to [raphael.waltenspuel@unibas.ch](mailto:raphael.waltenspuel@unibas.ch)!
-
-It is recommended that you first create a ```.gitignore``` file. (And exclude the "datasets" folder, for example). A useful tool for creating ```.gitignore``` files is www.gitignore.io.
-
-Feel free to update or replace this readme with a brief description of your project and goals.
+# Database Project Group 1
+
+## Preliminaries
+* Ensure you have access to a running postgres instance.
+* Ensure you have ```python3``` and ```pip``` installed.
+* From within the root of the project, run ```pip install -r requirements.txt```. This ensures all python dependencies are met.
+* In ```src/fill_db.py```, look for the ```db_info``` variable and adapt it to your credentials.
+
+## Action
+In the following, the order matters.
+1. Run ```ensure_dirs_exist.py```. This makes sure all the directories needed to perform the data integration and logging exist.
+2. Run ```integrate.py```. Adjust the main method to fit your needs. In particular, adjust the ```process_all_data_sources()``` call such that the parameter corresponding to a dataset is ```False``` if the script shall download it from the internet, and ```True``` otherwise. To get the geojson data for signaled speeds in the city of Zurich, uncomment the line in the ```main``` method where you find ```load_tempo_geojson_from_api_to_local()```.
+3. Run ```fill_db.py```. This will load the data into the database based on the credentials given in the ```db_info``` variable.
+4. Perform analysis.
```
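The ```db_info``` adaptation asked for in the Preliminaries refers to the dict shown later in this commit's ```src/fill_db.py``` diff. A minimal sketch with placeholder values follows; only ```host``` and ```database``` are visible in the diff, the remaining keys are inferred from the ```db_info[...]``` lookups in the main block:

```python
# Placeholder credentials -- adapt to your local Postgres setup.
db_info = {
    'host': 'localhost',
    'database': 'test-db23',
    'user': 'your_user',          # assumption: visible in the diff only as db_info['user']
    'password': 'your_password',  # assumption: visible in the diff only as db_info['password']
    'port': 5432,                 # assumption: default Postgres port
}
```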
26
src/ensure_dirs_exist.py
Normal file
26
src/ensure_dirs_exist.py
Normal file
```diff
@@ -0,0 +1,26 @@
+import logging
+import os
+"""
+The functionality of this script has been adapted from data_utils.ensure_dirs_exist().
+This needs to be run before any other script.
+"""
+data_dir = 'datasets/'
+integrated_dir = 'datasets/integrated/'
+logs_dir = 'logs/'
+
+logging.basicConfig(level=logging.DEBUG,
+                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger('ensure_dirs_exist.py')
+stream_handler = logging.StreamHandler()
+formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+stream_handler.setFormatter(formatter)
+logger.addHandler(stream_handler)
+
+logger.debug(f'data_dir: {data_dir}\n integrated_dir: {integrated_dir}')
+logger.info("Ensuring needed directories exist.")
+os.makedirs(data_dir, exist_ok=True)
+logger.debug("data_dir created.")
+os.makedirs(integrated_dir, exist_ok=True)
+logger.debug("integrated_dir created")
+os.makedirs(logs_dir, exist_ok=True)
+logger.debug("logs_dir created")
```
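For readers skimming the commit, the new script condenses to the following equivalent sketch (not part of the commit; it only illustrates the ```os.makedirs(..., exist_ok=True)``` idiom the script relies on):

```python
import os

# Create each required directory; exist_ok=True makes this a no-op
# when the directory already exists.
for d in ('datasets/', 'datasets/integrated/', 'logs/'):
    os.makedirs(d, exist_ok=True)
```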
src/fill_db.py

```diff
@@ -12,9 +12,14 @@ logger.addHandler(stream_handler)
 
 integrated_dir = 'datasets/integrated/'
 accident_geojson_file = 'datasets/integrated/Accidents.geojson'
+signaled_speeds_file = 'datasets/integrated/signaled_speeds.geojson.geojson'
 accident_loader_script = 'load_accidents_into_db.sh'
 accident_table_name = 'accidents'
+signaled_speeds_table_name = 'signaled_speeds'
+
+"""
+Make sure db_info contains the correct credentials
+"""
 db_info = {
     'host': 'localhost',
     'database': 'test-db23',
```
```diff
@@ -26,7 +31,6 @@ setup_tables_script = 'setup_tables.sql'
 load_csvs_into_db_script = 'load_csvs_into_db.sql'
 
 
-
 def run_sql(script, db_info):
     db_connection = psycopg2.connect(**db_info)
     db_cursor = db_connection.cursor()
```
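The hunk shows only the opening of ```run_sql()```. A plausible completion is sketched below; everything after the cursor line is elided by the diff and therefore an assumption, though it follows standard psycopg2 usage:

```python
import psycopg2

def run_sql(script, db_info):
    """Execute the SQL file `script` against the database described by db_info."""
    db_connection = psycopg2.connect(**db_info)
    db_cursor = db_connection.cursor()
    try:
        with open(script) as f:
            db_cursor.execute(f.read())  # assumption: the file holds one SQL batch
        db_connection.commit()
    finally:
        db_cursor.close()
        db_connection.close()
```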
```diff
@@ -47,7 +51,6 @@ def run_sql(script, db_info):
 
-
 def run_geojson_loader_script(script, *args):
 
     try:
         cmd = ['bash', script] + list(args)
         res = subprocess.run(cmd, check=True, text=True, capture_output=True)
```
```diff
@@ -57,7 +60,13 @@ def run_geojson_loader_script(script, *args):
     logger.info(f"Remember to set the correct permissions for the script: chmod +x {script}")
 
 
-def geojson_loader(*args):
+def geojson_loader(*args, modus='append'):
+    """
+    Use this instead of run_geojson_loader_script() in the main method to avoid the bash dependency.
+    :param args: All the arguments needed for ogr2ogr to run properly
+    :param modus: append or overwrite db table
+    :return:
+    """
     geojson_file, db_name, db_user, db_password, db_host, db_port, target_table = args
     cmd = [
         "ogr2ogr",
```
```diff
@@ -65,7 +74,7 @@ def geojson_loader(*args):
         f"PG:dbname='{db_name}' host='{db_host}' port='{db_port}' user='{db_user}' password='{db_password}'",
         geojson_file,
         "-nln", target_table,
-        "-append"
+        f"-{modus}"
     ]
     try:
         # Run the command
```
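Pieced together from the two hunks above, the new ```geojson_loader()``` plausibly reads as follows. The ```-f PostgreSQL``` argument and the final ```subprocess.run``` call fall in lines the diff elides, so they are assumptions (both are the standard ogr2ogr/subprocess pattern):

```python
import subprocess

def geojson_loader(*args, modus='append'):
    """
    Use this instead of run_geojson_loader_script() in the main method to avoid the bash dependency.
    :param args: all the arguments needed for ogr2ogr to run properly
    :param modus: append or overwrite db table
    """
    geojson_file, db_name, db_user, db_password, db_host, db_port, target_table = args
    cmd = [
        "ogr2ogr",
        "-f", "PostgreSQL",  # assumption: the output-driver flag sits in an elided line
        f"PG:dbname='{db_name}' host='{db_host}' port='{db_port}' user='{db_user}' password='{db_password}'",
        geojson_file,
        "-nln", target_table,  # -nln names the target table
        f"-{modus}",           # expands to '-append' or '-overwrite'
    ]
    # Run the command (assumption: mirrors run_geojson_loader_script's invocation)
    subprocess.run(cmd, check=True, text=True, capture_output=True)
```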
```diff
@@ -88,5 +97,12 @@ if __name__ == '__main__':
                                db_info['host'],
                                db_info['port'],
                                accident_table_name)
-    logger.info('Finished loading geojson into db using bash script.')
+    logger.info('Finished loading accident geojson into db using bash script.')
+
+    geojson_loader(signaled_speeds_file,
+                   db_info['database'],
+                   db_info['user'],
+                   db_info['password'],
+                   db_info['host'],
+                   db_info['port'],
+                   signaled_speeds_table_name,
+                   modus='overwrite')
```
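Note the asymmetry in the main block: accident data still goes through the bash loader script, while the signaled-speed data uses the pure-Python ```geojson_loader()``` with ```modus='overwrite'```, so reruns replace the table rather than appending duplicate rows.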
src/integrate.py

```diff
@@ -79,23 +79,21 @@ acc_data_types = {
     'RoadType_en': 'str',
     'geometry': 'str'  # TODO: Figure out what dtype this needs to be for postgres
 }
 
-def ensure_dirs_exist(data_dir, integrated_dir):
-    """
-    This should be called before anything else to make sure that the relevant directories exist.
-    :param data_dir: directory where the datasets are stored
-    :param integrated_dir: directory where the integrated data will be stored
-    :return:
-    """
-    logger.debug(f'data_dir: {data_dir}\n integrated_dir: {integrated_dir}')
-    logger.info("Ensuring needed directories exist.")
-    os.makedirs(data_dir, exist_ok=True)
-    logger.debug("data_dir created.")
-    os.makedirs(integrated_dir, exist_ok=True)
-    logger.debug("integrated_dir created")
-    os.makedirs(logs_dir, exist_ok=True)
-    logger.debug("logs_dir created")
+# def ensure_dirs_exist(data_dir, integrated_dir, logs_dir):
+#     """
+#     This should be called before anything else to make sure that the relevant directories exist.
+#     :param data_dir: directory where the datasets are stored
+#     :param integrated_dir: directory where the integrated data will be stored
+#     :return:
+#     """
+#     logger.debug(f'data_dir: {data_dir}\n integrated_dir: {integrated_dir}')
+#     logger.info("Ensuring needed directories exist.")
+#     os.makedirs(data_dir, exist_ok=True)
+#     logger.debug("data_dir created.")
+#     os.makedirs(integrated_dir, exist_ok=True)
+#     logger.debug("integrated_dir created")
+#     os.makedirs(logs_dir, exist_ok=True)
+#     logger.debug("logs_dir created")
 
 
 def process_foot_bike_data(files_present=True):
```
```diff
@@ -180,7 +178,7 @@ def process_all_data_sources(fb_present=True, miv_present=True, accident_present
     :param accident_present: bool, whether the files are present in the local file system
     :return:
     """
-    ensure_dirs_exist(data_dir, integrated_dir)
+    # ensure_dirs_exist(data_dir, integrated_dir)
     logger.info("Started processing all data sources.")
     fb_to_integrated(fb_present)
```
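For orientation, the call that the README's step 2 refers to would look like the following hypothetical invocation (flag semantics per the README: ```False``` means the script downloads the dataset, ```True``` means it is already on disk):

```python
# Hypothetical: foot/bike and MIV data already present locally,
# accident data to be downloaded from the internet.
process_all_data_sources(fb_present=True, miv_present=True, accident_present=False)
```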
```diff
@@ -240,6 +238,8 @@ def load_tempo_geojson_from_api_to_local():
 
 
 if __name__ == '__main__':
-    process_all_data_sources(True, True, True)
+    # ensure_dirs_exist(data_dir, integrated_dir, logs_dir)
+    # process_all_data_sources(True, True, True)
     # miv_to_integrated_csv()
     # acc_to_cleaned_geojson()
+    load_tempo_geojson_from_api_to_local()
```