CSV Handler for Construction Data
Overview
CSV is the universal exchange format in construction - from scheduling exports to cost databases. This skill handles encoding issues, delimiter detection, and data cleaning.
Python Implementation
import pandas as pd import csv from typing import Dict, Any, List, Optional, Tuple from pathlib import Path from dataclasses import dataclass import chardet
@dataclass class CSVProfile: """Profile of CSV file.""" encoding: str delimiter: str has_header: bool row_count: int column_count: int columns: List[str]
class ConstructionCSVHandler: """Handle CSV files from construction software."""
COMMON_DELIMITERS = [',', ';', '\t', '|']
COMMON_ENCODINGS = ['utf-8', 'utf-8-sig', 'latin-1', 'cp1252', 'iso-8859-1']
def __init__(self):
self.last_profile: Optional[CSVProfile] = None
def detect_encoding(self, file_path: str) -> str:
"""Detect file encoding."""
with open(file_path, 'rb') as f:
raw = f.read(10000)
result = chardet.detect(raw)
return result.get('encoding', 'utf-8') or 'utf-8'
def detect_delimiter(self, file_path: str, encoding: str) -> str:
"""Detect CSV delimiter."""
with open(file_path, 'r', encoding=encoding, errors='replace') as f:
sample = f.read(5000)
# Count occurrences
counts = {d: sample.count(d) for d in self.COMMON_DELIMITERS}
# Return most common that appears consistently
if counts:
return max(counts, key=counts.get)
return ','
def profile_csv(self, file_path: str) -> CSVProfile:
"""Profile CSV file."""
encoding = self.detect_encoding(file_path)
delimiter = self.detect_delimiter(file_path, encoding)
# Read sample
df = pd.read_csv(file_path, encoding=encoding, delimiter=delimiter,
nrows=10, on_bad_lines='skip')
has_header = not df.columns[0].replace('.', '').replace('-', '').isdigit()
# Full row count
with open(file_path, 'r', encoding=encoding, errors='replace') as f:
row_count = sum(1 for _ in f) - (1 if has_header else 0)
profile = CSVProfile(
encoding=encoding,
delimiter=delimiter,
has_header=has_header,
row_count=row_count,
column_count=len(df.columns),
columns=list(df.columns)
)
self.last_profile = profile
return profile
def read_csv(self, file_path: str,
encoding: Optional[str] = None,
delimiter: Optional[str] = None,
clean: bool = True) -> pd.DataFrame:
"""Read CSV with auto-detection."""
# Auto-detect if not provided
if encoding is None:
encoding = self.detect_encoding(file_path)
if delimiter is None:
delimiter = self.detect_delimiter(file_path, encoding)
# Read with error handling
df = pd.read_csv(
file_path,
encoding=encoding,
delimiter=delimiter,
on_bad_lines='skip',
low_memory=False
)
if clean:
df = self.clean_dataframe(df)
return df
def clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
"""Clean construction CSV data."""
# Clean column names
df.columns = [self._clean_column_name(c) for c in df.columns]
# Remove empty rows and columns
df = df.dropna(how='all')
df = df.dropna(axis=1, how='all')
# Strip whitespace from strings
for col in df.select_dtypes(include=['object']):
df[col] = df[col].str.strip() if df[col].dtype == 'object' else df[col]
return df
def _clean_column_name(self, name: str) -> str:
"""Clean column name."""
if not isinstance(name, str):
return str(name)
# Remove special characters, replace spaces
clean = name.strip().lower()
clean = clean.replace(' ', '_').replace('-', '_')
clean = ''.join(c for c in clean if c.isalnum() or c == '_')
return clean
def merge_csvs(self, file_paths: List[str],
on_column: Optional[str] = None) -> pd.DataFrame:
"""Merge multiple CSV files."""
dfs = []
for path in file_paths:
df = self.read_csv(path)
df['_source_file'] = Path(path).name
dfs.append(df)
if not dfs:
return pd.DataFrame()
if on_column and on_column in dfs[0].columns:
result = dfs[0]
for df in dfs[1:]:
result = pd.merge(result, df, on=on_column, how='outer')
return result
return pd.concat(dfs, ignore_index=True)
def split_csv(self, df: pd.DataFrame,
group_column: str,
output_dir: str) -> List[str]:
"""Split CSV by column values."""
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
files = []
for value in df[group_column].unique():
subset = df[df[group_column] == value]
filename = f"{group_column}_{value}.csv"
filepath = output_path / filename
subset.to_csv(filepath, index=False)
files.append(str(filepath))
return files
def convert_types(self, df: pd.DataFrame,
type_map: Dict[str, str] = None) -> pd.DataFrame:
"""Convert column types intelligently."""
df = df.copy()
if type_map:
for col, dtype in type_map.items():
if col in df.columns:
try:
df[col] = df[col].astype(dtype)
except:
pass
else:
# Auto-convert
for col in df.columns:
# Try numeric
try:
df[col] = pd.to_numeric(df[col])
continue
except:
pass
# Try datetime
try:
df[col] = pd.to_datetime(df[col])
except:
pass
return df
def export_csv(self, df: pd.DataFrame,
file_path: str,
encoding: str = 'utf-8-sig',
delimiter: str = ',') -> str:
"""Export DataFrame to CSV."""
df.to_csv(file_path, encoding=encoding, sep=delimiter, index=False)
return file_path
Specialized handlers
class ScheduleCSVHandler(ConstructionCSVHandler): """Handler for project schedule CSVs."""
SCHEDULE_COLUMNS = ['task_id', 'task_name', 'start_date', 'end_date',
'duration', 'predecessors', 'resources']
def parse_schedule(self, file_path: str) -> pd.DataFrame:
"""Parse schedule CSV."""
df = self.read_csv(file_path)
# Convert date columns
for col in df.columns:
if 'date' in col.lower() or 'start' in col.lower() or 'end' in col.lower():
try:
df[col] = pd.to_datetime(df[col])
except:
pass
return df
class CostCSVHandler(ConstructionCSVHandler): """Handler for cost/estimate CSVs."""
def parse_costs(self, file_path: str) -> pd.DataFrame:
"""Parse cost CSV."""
df = self.read_csv(file_path)
# Find and convert numeric columns
for col in df.columns:
if any(word in col.lower() for word in ['cost', 'price', 'amount', 'total', 'qty', 'quantity']):
df[col] = pd.to_numeric(df[col].replace(r'[\$,]', '', regex=True), errors='coerce')
return df
Quick Start
handler = ConstructionCSVHandler()
Profile CSV first
profile = handler.profile_csv("export.csv") print(f"Encoding: {profile.encoding}, Delimiter: '{profile.delimiter}'")
Read with auto-detection
df = handler.read_csv("export.csv") print(f"Loaded {len(df)} rows, {len(df.columns)} columns")
Common Use Cases
- Merge Multiple Exports
files = ["jan_export.csv", "feb_export.csv", "mar_export.csv"] merged = handler.merge_csvs(files)
- Split by Category
handler.split_csv(df, group_column='category', output_dir='./split_files')
- Schedule Import
schedule_handler = ScheduleCSVHandler() schedule = schedule_handler.parse_schedule("p6_export.csv")
Resources
- DDC Book: Chapter 2.1 - Structured Data