csv-wrangler

Patterns for handling real-world messy CSV files.

Safety Notice

This listing is imported from skills.sh public index metadata. Review upstream SKILL.md and repository scripts before running.

Copy this and send it to your AI assistant to learn

Install skill "csv-wrangler" with this command: npx skills add majesticlabs-dev/majestic-marketplace/majesticlabs-dev-majestic-marketplace-csv-wrangler

CSV-Wrangler

Patterns for handling real-world messy CSV files.

Encoding Detection

import chardet

def detect_encoding(file_path: str, sample_size: int = 10000) -> str: """Detect file encoding from sample.""" with open(file_path, 'rb') as f: raw = f.read(sample_size) result = chardet.detect(raw) return result['encoding']

def read_with_encoding(path: str) -> pd.DataFrame: """Read CSV with auto-detected encoding.""" encoding = detect_encoding(path) # Common fallback chain encodings = [encoding, 'utf-8', 'latin-1', 'cp1252']

for enc in encodings:
    try:
        return pd.read_csv(path, encoding=enc)
    except UnicodeDecodeError:
        continue

# Last resort: ignore errors
return pd.read_csv(path, encoding='utf-8', errors='ignore')

Delimiter Detection

import csv

def detect_delimiter(file_path: str) -> str: """Detect CSV delimiter from file sample.""" with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: sample = f.read(4096)

sniffer = csv.Sniffer()
try:
    dialect = sniffer.sniff(sample, delimiters=',;\t|')
    return dialect.delimiter
except csv.Error:
    # Count occurrences and pick most common
    counts = {d: sample.count(d) for d in [',', ';', '\t', '|']}
    return max(counts, key=counts.get)

def read_with_delimiter_detection(path: str) -> pd.DataFrame: """Read CSV with auto-detected delimiter.""" delimiter = detect_delimiter(path) return pd.read_csv(path, sep=delimiter)

Handling Malformed Rows

def read_with_error_handling(path: str) -> tuple[pd.DataFrame, list[dict]]: """Read CSV, capturing malformed rows separately.""" good_rows = [] bad_rows = []

with open(path, 'r', encoding='utf-8', errors='replace') as f:
    reader = csv.reader(f)
    header = next(reader)
    expected_cols = len(header)

    for line_num, row in enumerate(reader, start=2):
        if len(row) == expected_cols:
            good_rows.append(row)
        else:
            bad_rows.append({
                'line': line_num,
                'expected': expected_cols,
                'actual': len(row),
                'data': row
            })

df = pd.DataFrame(good_rows, columns=header)
return df, bad_rows

Using pandas error_bad_lines (deprecated) / on_bad_lines

def read_skip_bad_lines(path: str) -> pd.DataFrame: """Read CSV, skipping malformed rows.""" return pd.read_csv( path, on_bad_lines='skip', # or 'warn' to see them engine='python' )

Quoted Fields with Embedded Delimiters

def read_quoted_csv(path: str) -> pd.DataFrame: """Handle CSVs with quoted fields containing delimiters.""" return pd.read_csv( path, quotechar='"', doublequote=True, # Handle "" as escaped quote escapechar='\', # Handle , as escaped comma engine='python' )

For really messy files

def read_regex_separated(path: str, pattern: str = r',(?=(?:[^"]"[^"]")[^"]$)') -> pd.DataFrame: """Split on delimiter only outside quotes using regex.""" import re rows = [] with open(path, 'r') as f: for line in f: rows.append(re.split(pattern, line.strip())) return pd.DataFrame(rows[1:], columns=rows[0])

Header Detection

def detect_header_row(path: str, max_rows: int = 10) -> int: """Find the header row in files with metadata at top.""" with open(path, 'r') as f: lines = [f.readline() for _ in range(max_rows)]

delimiter = detect_delimiter(path)

for i, line in enumerate(lines):
    parts = line.split(delimiter)
    # Header likely has more columns than metadata
    # and contains text-like values
    if len(parts) > 2 and all(p.strip().replace('_', '').isalnum() for p in parts[:3]):
        return i

return 0  # Default to first row

def read_with_header_detection(path: str) -> pd.DataFrame: """Read CSV, auto-detecting header row.""" header_row = detect_header_row(path) return pd.read_csv(path, skiprows=header_row)

Date Parsing

def infer_date_columns(df: pd.DataFrame) -> list[str]: """Identify columns that look like dates.""" date_cols = [] date_patterns = [ r'\d{4}-\d{2}-\d{2}', # 2024-01-15 r'\d{2}/\d{2}/\d{4}', # 01/15/2024 r'\d{2}-\d{2}-\d{4}', # 15-01-2024 r'\d{4}/\d{2}/\d{2}', # 2024/01/15 ]

for col in df.columns:
    if df[col].dtype == 'object':
        sample = df[col].dropna().head(100)
        for pattern in date_patterns:
            if sample.str.match(pattern).mean() > 0.8:
                date_cols.append(col)
                break

return date_cols

def parse_dates_flexibly(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame: """Parse dates with multiple format attempts.""" for col in columns: try: df[col] = pd.to_datetime(df[col], infer_datetime_format=True) except: # Try common formats explicitly for fmt in ['%Y-%m-%d', '%m/%d/%Y', '%d/%m/%Y', '%Y/%m/%d']: try: df[col] = pd.to_datetime(df[col], format=fmt) break except: continue return df

Numeric Cleaning

def clean_numeric_column(series: pd.Series) -> pd.Series: """Clean numeric column with currency symbols, commas, etc.""" if series.dtype == 'object': cleaned = (series .astype(str) .str.replace(r'[$€£¥,]', '', regex=True) # Remove currency/commas .str.replace(r'\s+', '', regex=True) # Remove whitespace .str.replace(r'((.+))', r'-\1', regex=True) # (123) -> -123 ) return pd.to_numeric(cleaned, errors='coerce') return series

def clean_all_numeric(df: pd.DataFrame) -> pd.DataFrame: """Attempt to convert object columns to numeric.""" for col in df.select_dtypes(include=['object']).columns: converted = pd.to_numeric(df[col], errors='coerce') # Keep conversion if >80% success if converted.notna().mean() > 0.8: df[col] = converted return df

Complete Wrangling Pipeline

def wrangle_csv(path: str) -> tuple[pd.DataFrame, dict]: """Full pipeline for messy CSV handling.""" report = {'issues': [], 'fixes': []}

# Step 1: Detect encoding
encoding = detect_encoding(path)
report['encoding'] = encoding

# Step 2: Detect delimiter
delimiter = detect_delimiter(path)
report['delimiter'] = delimiter

# Step 3: Detect header row
header_row = detect_header_row(path)
if header_row > 0:
    report['fixes'].append(f"Skipped {header_row} metadata rows")

# Step 4: Read with error handling
try:
    df = pd.read_csv(
        path,
        encoding=encoding,
        sep=delimiter,
        skiprows=header_row,
        on_bad_lines='warn',
        engine='python'
    )
except Exception as e:
    report['issues'].append(f"Read error: {e}")
    df = pd.read_csv(path, encoding='latin-1', on_bad_lines='skip')

# Step 5: Clean column names
df.columns = (df.columns
    .str.strip()
    .str.lower()
    .str.replace(r'\s+', '_', regex=True)
    .str.replace(r'[^\w]', '', regex=True)
)
report['fixes'].append("Normalized column names")

# Step 6: Parse dates
date_cols = infer_date_columns(df)
if date_cols:
    df = parse_dates_flexibly(df, date_cols)
    report['fixes'].append(f"Parsed dates: {date_cols}")

# Step 7: Clean numeric columns
df = clean_all_numeric(df)

report['shape'] = df.shape
report['dtypes'] = df.dtypes.to_dict()

return df, report

Excel File Handling

def read_excel_smart(path: str, sheet: str | int = 0) -> pd.DataFrame: """Read Excel with common cleanup.""" df = pd.read_excel( path, sheet_name=sheet, engine='openpyxl', # For .xlsx # engine='xlrd', # For .xls )

# Drop fully empty rows/columns
df = df.dropna(how='all').dropna(axis=1, how='all')

# Reset index after dropping
df = df.reset_index(drop=True)

return df

def list_excel_sheets(path: str) -> list[str]: """List all sheets in an Excel file.""" xl = pd.ExcelFile(path) return xl.sheet_names

Source Transparency

This detail page is rendered from real SKILL.md content. Trust labels are metadata-based hints, not a safety guarantee.

Related Skills

Related by shared tags or category signals.

Coding

google-ads-strategy

No summary provided by upstream source.

Repository SourceNeeds Review
Coding

viral-content

No summary provided by upstream source.

Repository SourceNeeds Review
Coding

market-research

No summary provided by upstream source.

Repository SourceNeeds Review
Coding

free-tool-arsenal

No summary provided by upstream source.

Repository SourceNeeds Review