# data_utils.py
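"""Helpers for downloading the project's source CSV/GeoJSON files and loading
them into pandas / GeoPandas DataFrames."""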
import os
import pandas as pd
import requests
from urllib.parse import urlparse
import geopandas as gpd
from concurrent.futures import ThreadPoolExecutor as tpe


def download_csv(url, local_filename):
    """Stream the file at `url` to `local_filename` in 8 KiB chunks."""
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)


def process_urls(data_dir, urls_file):
    # Ensure the data directory exists
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    # Read URLs from the file
    with open(urls_file, 'r') as file:
        urls = file.readlines()
    # Process each URL
    for url in urls:
        url = url.strip()
        filename = os.path.basename(urlparse(url).path)
        local_filename = os.path.join(data_dir, filename)
        # Check if the file already exists
        if not os.path.isfile(local_filename):
            print(f"Downloading {url}...")
            download_csv(url, local_filename)
            print(f"Saved to {local_filename}")
        else:
            print(f"File {filename} already exists in {data_dir}, skipping download.")


def load_dataframe_from_csv(filepath):
    try:
        df = pd.read_csv(filepath, low_memory=False)
        return df
    except Exception as e:
        print(f"Error loading {filepath}: {e}")
        return None


def load_dataframes_from_csv_files(data_dir, u_string):
    # Load matching CSV files concurrently; u_string filters filenames
    futures = []
    with tpe(max_workers=5) as executor:
        for filename in os.listdir(data_dir):
            if (u_string in filename) and filename.endswith('.csv'):
                filepath = os.path.join(data_dir, filename)
                futures.append(executor.submit(load_dataframe_from_csv, filepath))
    # Collect results, dropping files that failed to load
    dataframes = [future.result() for future in futures]
    return [df for df in dataframes if df is not None]
    # Sequential version kept for reference:
    # for filename in os.listdir(data_dir):
    #     if (u_string in filename) and filename.endswith('.csv'):
    #         filepath = os.path.join(data_dir, filename)
    #         df = pd.read_csv(filepath, low_memory=False)
    #         dataframes.append(df)
    # return dataframes


def load_dataframes_from_geojson_files(data_dir, u_string):
    print('u_string', u_string)
    gdf = gpd.GeoDataFrame()
    for filename in os.listdir(data_dir):
        print("Filename:", filename)
        if (u_string in filename) and filename.endswith('.json'):
            filepath = os.path.join(data_dir, filename)
            print("Filepath:", filepath)
            gdf = gpd.read_file(filepath)  # Read GeoJSON directly as GeoDataFrame
    # Note: if several files match, only the last one read is returned
    return gdf


def combine_dataframes(dataframes):
    if dataframes:
        combined_dataframe = pd.concat(dataframes, ignore_index=True)
        return combined_dataframe
    else:
        print("No dataframes to combine")
        return pd.DataFrame()  # Return an empty DataFrame


def create_unified_df(urls_file, u_string, data_dir, files_present=False):
    df_list = []
    df_unified = None
    if not files_present:
        process_urls(data_dir, urls_file)
    df_list = load_dataframes_from_csv_files(data_dir, u_string)
    df_unified = combine_dataframes(df_list)
    return df_unified


def save_dataframe_to_csv(df, integrated_dir, filename):
    # Not implemented yet; presumably writes df to os.path.join(integrated_dir, filename)
    pass


if __name__ == "__main__":
    # Test the functions here if necessary
    csv_urls_file = '../docs/all_csv_urls.txt'
    datasets_dir = 'datasets/'
    output_file = 'column_names.txt'
    process_urls(datasets_dir, csv_urls_file)
    # extract_column_names(datasets_dir, output_file)
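    # Example (sketch, not from the original file): once the CSVs are downloaded,
    # a unified DataFrame could be built like this. The filter string 'example'
    # is a placeholder for whatever substring identifies one dataset's files.
    # df = create_unified_df(csv_urls_file, 'example', datasets_dir, files_present=True)
    # print(df.shape)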