45 lines
1.1 KiB
Python
45 lines
1.1 KiB
Python
import pandas as pd
|
|
import yaml
|
|
import os
|
|
|
|
|
|
def infer_general_dtype(dtype) -> str:
    """Map a pandas/NumPy dtype to a coarse type label.

    Args:
        dtype: A dtype (or dtype-like object) as found in ``DataFrame.dtypes``.

    Returns:
        ``"int"`` for integer dtypes, ``"float"`` for floating-point dtypes,
        ``"datetime"`` for any datetime64 dtype (naive or tz-aware), and
        ``"string"`` for everything else.
    """
    if pd.api.types.is_integer_dtype(dtype):
        return "int"
    if pd.api.types.is_float_dtype(dtype):
        return "float"
    if pd.api.types.is_datetime64_any_dtype(dtype):
        return "datetime"
    # Fallback bucket: object, bool, category, timedelta, ... all land here.
    return "string"
|
|
|
|
|
|
def load_schema(input_file) -> dict:
    """Infer a ``{column_name: type_label}`` mapping from a data file.

    The file format is chosen from the (case-insensitive) file extension.

    Args:
        input_file: Path to a ``.csv`` or ``.parquet`` file.

    Returns:
        A dict, in column order, mapping each column name to the label
        produced by ``infer_general_dtype`` for that column's dtype.

    Raises:
        ValueError: If the extension is neither ``.csv`` nor ``.parquet``.
    """
    ext = os.path.splitext(input_file)[1].lower()

    if ext == ".csv":
        # Only the first 100 rows are sampled, so dtypes reflect that sample.
        df = pd.read_csv(input_file, nrows=100)
    elif ext == ".parquet":
        # NOTE: unlike the CSV branch, this loads the whole file.
        df = pd.read_parquet(input_file)
    else:
        raise ValueError(f"Unsupported file format: {ext}")

    return {col: infer_general_dtype(dtype) for col, dtype in df.dtypes.items()}
|
|
|
|
|
|
def generate_combined_config(pbp_file, hk_file, output_file="config.yaml"):
    """Write a YAML config holding the inferred schemas of two data files.

    Args:
        pbp_file: Path to the file whose schema is stored under ``pbp_schema``.
        hk_file: Path to the file whose schema is stored under ``hk_schema``.
        output_file: Destination YAML path; overwritten if it already exists.

    Raises:
        ValueError: Propagated from ``load_schema`` for unsupported file types.
    """
    config = {
        "pbp_schema": load_schema(pbp_file),
        "hk_schema": load_schema(hk_file),
    }

    # Explicit encoding keeps the output independent of the platform locale.
    with open(output_file, "w", encoding="utf-8") as f:
        # sort_keys=False preserves the column order of each input file.
        yaml.dump(config, f, sort_keys=False)

    print(f"Unified config saved to: {output_file}")
|
|
|
|
|
|
# Example usage. Guarded so that importing this module does not read the
# input files or write config.yaml as a side effect.
if __name__ == "__main__":
    generate_combined_config("pbp_meta.parquet", "hk_meta.parquet")
|