# data_utils.py
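"""Helpers for downloading the project's source CSV/GeoJSON files and loading
them into pandas / GeoPandas DataFrames."""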
import os
import pandas as pd
import requests
from urllib.parse import urlparse
import geopandas as gpd
from concurrent.futures import ThreadPoolExecutor as tpe


def download_csv(url, local_filename):
    """Stream the file at `url` to `local_filename` in 8 KiB chunks."""
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)


def process_urls(data_dir, urls_file):
    # Ensure the data directory exists
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    # Read URLs from the file
    with open(urls_file, 'r') as file:
        urls = file.readlines()
    # Process each URL
    for url in urls:
        url = url.strip()
        filename = os.path.basename(urlparse(url).path)
        local_filename = os.path.join(data_dir, filename)
        # Check if the file already exists
        if not os.path.isfile(local_filename):
            print(f"Downloading {url}...")
            download_csv(url, local_filename)
            print(f"Saved to {local_filename}")
        else:
            print(f"File {filename} already exists in {data_dir}, skipping download.")


def load_dataframe_from_csv(filepath):
    try:
        df = pd.read_csv(filepath, low_memory=False)
        return df
    except Exception as e:
        print(f"Error loading {filepath}: {e}")
        return None


def load_dataframes_from_csv_files(data_dir, u_string):
    # Load matching CSV files concurrently; u_string filters filenames
    futures = []
    with tpe(max_workers=5) as executor:
        for filename in os.listdir(data_dir):
            if (u_string in filename) and filename.endswith('.csv'):
                filepath = os.path.join(data_dir, filename)
                futures.append(executor.submit(load_dataframe_from_csv, filepath))
    # Collect results, dropping files that failed to load
    dataframes = [future.result() for future in futures]
    return [df for df in dataframes if df is not None]
    # Sequential version kept for reference:
    # for filename in os.listdir(data_dir):
    #     if (u_string in filename) and filename.endswith('.csv'):
    #         filepath = os.path.join(data_dir, filename)
    #         df = pd.read_csv(filepath, low_memory=False)
    #         dataframes.append(df)
    # return dataframes


def load_dataframes_from_geojson_files(data_dir, u_string):
    print('u_string', u_string)
    gdf = gpd.GeoDataFrame()
    for filename in os.listdir(data_dir):
        print("Filename:", filename)
        if (u_string in filename) and filename.endswith('.json'):
            filepath = os.path.join(data_dir, filename)
            print("Filepath:", filepath)
            gdf = gpd.read_file(filepath)  # Read GeoJSON directly as GeoDataFrame
    # Note: if several files match, only the last one read is returned
    return gdf


def combine_dataframes(dataframes):
    if dataframes:
        combined_dataframe = pd.concat(dataframes, ignore_index=True)
        return combined_dataframe
    else:
        print("No dataframes to combine")
        return pd.DataFrame()  # Return an empty DataFrame


def create_unified_df(urls_file, u_string, data_dir, files_present=False):
    df_list = []
    df_unified = None
    if not files_present:
        process_urls(data_dir, urls_file)
    df_list = load_dataframes_from_csv_files(data_dir, u_string)
    df_unified = combine_dataframes(df_list)
    return df_unified


def save_dataframe_to_csv(df, integrated_dir, filename):
    # Not implemented yet; presumably writes df to os.path.join(integrated_dir, filename)
    pass


if __name__ == "__main__":
    # Test the functions here if necessary
    csv_urls_file = '../docs/all_csv_urls.txt'
    datasets_dir = 'datasets/'
    output_file = 'column_names.txt'
    process_urls(datasets_dir, csv_urls_file)
    # extract_column_names(datasets_dir, output_file)
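    # Example (sketch, not from the original file): once the CSVs are downloaded,
    # a unified DataFrame could be built like this. The filter string 'example'
    # is a placeholder for whatever substring identifies one dataset's files.
    # df = create_unified_df(csv_urls_file, 'example', datasets_dir, files_present=True)
    # print(df.shape)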