Duration Prediction

Business Case

Problem Statement

Project duration estimation challenges:

Subjective expert estimates
Lack of historical benchmarking
Inaccurate early-stage predictions
Difficulty comparing similar projects

Solution

Machine learning-based duration prediction using k-Nearest Neighbors and regression models trained on historical project data.

Technical Implementation

import pandas as pd import numpy as np from typing import Dict, Any, List, Optional, Tuple from dataclasses import dataclass, field from datetime import date from enum import Enum import math

class ModelType(Enum): KNN = "knn" LINEAR_REGRESSION = "linear_regression" WEIGHTED_KNN = "weighted_knn"

class ProjectType(Enum): OFFICE = "office" RESIDENTIAL = "residential" INDUSTRIAL = "industrial" RETAIL = "retail" HEALTHCARE = "healthcare" EDUCATION = "education"

@dataclass class ProjectFeatures: project_id: str project_type: ProjectType size_sf: float floors: int complexity: int # 1-5 location_factor: float # Cost adjustment factor has_basement: bool = False is_renovation: bool = False actual_duration: Optional[int] = None # Days

@dataclass class PredictionResult: predicted_duration: int confidence_interval: Tuple[int, int] similar_projects: List[str] model_used: ModelType features_importance: Dict[str, float]

class DurationPredictor: """Predict project duration using ML techniques."""

def __init__(self):
    self.training_data: List[ProjectFeatures] = []
    self.feature_weights: Dict[str, float] = {
        'size_sf': 0.30,
        'floors': 0.15,
        'complexity': 0.25,
        'location_factor': 0.10,
        'has_basement': 0.10,
        'is_renovation': 0.10
    }
    self.type_baseline_days: Dict[ProjectType, Dict[str, float]] = {
        ProjectType.OFFICE: {'base': 300, 'per_1000sf': 0.5},
        ProjectType.RESIDENTIAL: {'base': 240, 'per_1000sf': 0.4},
        ProjectType.INDUSTRIAL: {'base': 180, 'per_1000sf': 0.3},
        ProjectType.RETAIL: {'base': 200, 'per_1000sf': 0.35},
        ProjectType.HEALTHCARE: {'base': 400, 'per_1000sf': 0.6},
        ProjectType.EDUCATION: {'base': 320, 'per_1000sf': 0.45}
    }

def add_training_project(self, project: ProjectFeatures):
    """Add project to training dataset."""
    if project.actual_duration is not None:
        self.training_data.append(project)

def load_training_data(self, df: pd.DataFrame):
    """Load training data from DataFrame."""

    for _, row in df.iterrows():
        project = ProjectFeatures(
            project_id=str(row['project_id']),
            project_type=ProjectType(row['project_type'].lower()),
            size_sf=float(row['size_sf']),
            floors=int(row['floors']),
            complexity=int(row['complexity']),
            location_factor=float(row.get('location_factor', 1.0)),
            has_basement=bool(row.get('has_basement', False)),
            is_renovation=bool(row.get('is_renovation', False)),
            actual_duration=int(row['actual_duration'])
        )
        self.add_training_project(project)

def _extract_features(self, project: ProjectFeatures) -> np.ndarray:
    """Extract feature vector from project."""

    return np.array([
        project.size_sf / 10000,  # Normalize to 10k SF
        project.floors,
        project.complexity,
        project.location_factor,
        1 if project.has_basement else 0,
        1 if project.is_renovation else 0
    ])

def _calculate_distance(self, features1: np.ndarray,
                        features2: np.ndarray) -> float:
    """Calculate weighted Euclidean distance."""

    weights = np.array(list(self.feature_weights.values()))
    diff = (features1 - features2) ** 2
    weighted_diff = diff * weights
    return math.sqrt(np.sum(weighted_diff))

def _find_k_nearest(self, target: ProjectFeatures, k: int = 5,
                    same_type: bool = True) -> List[Tuple[ProjectFeatures, float]]:
    """Find k nearest neighbors."""

    target_features = self._extract_features(target)
    distances = []

    for project in self.training_data:
        if same_type and project.project_type != target.project_type:
            continue

        proj_features = self._extract_features(project)
        distance = self._calculate_distance(target_features, proj_features)
        distances.append((project, distance))

    distances.sort(key=lambda x: x[1])
    return distances[:k]

def predict_knn(self, target: ProjectFeatures, k: int = 5) -> PredictionResult:
    """Predict duration using k-NN."""

    nearest = self._find_k_nearest(target, k)

    if not nearest:
        # Fall back to baseline
        return self._predict_baseline(target)

    # Simple average of k nearest
    durations = [p.actual_duration for p, _ in nearest]
    predicted = int(np.mean(durations))

    # Confidence interval (using std dev)
    std = np.std(durations)
    lower = int(predicted - 1.96 * std)
    upper = int(predicted + 1.96 * std)

    return PredictionResult(
        predicted_duration=predicted,
        confidence_interval=(max(1, lower), upper),
        similar_projects=[p.project_id for p, _ in nearest],
        model_used=ModelType.KNN,
        features_importance=self.feature_weights
    )

def predict_weighted_knn(self, target: ProjectFeatures, k: int = 5) -> PredictionResult:
    """Predict duration using distance-weighted k-NN."""

    nearest = self._find_k_nearest(target, k)

    if not nearest:
        return self._predict_baseline(target)

    # Inverse distance weighting
    total_weight = 0
    weighted_sum = 0

    for project, distance in nearest:
        weight = 1 / (distance + 0.001)  # Add small value to avoid division by zero
        weighted_sum += project.actual_duration * weight
        total_weight += weight

    predicted = int(weighted_sum / total_weight)

    # Confidence interval
    durations = [p.actual_duration for p, _ in nearest]
    std = np.std(durations)
    lower = int(predicted - 1.96 * std)
    upper = int(predicted + 1.96 * std)

    return PredictionResult(
        predicted_duration=predicted,
        confidence_interval=(max(1, lower), upper),
        similar_projects=[p.project_id for p, _ in nearest],
        model_used=ModelType.WEIGHTED_KNN,
        features_importance=self.feature_weights
    )

def predict_regression(self, target: ProjectFeatures) -> PredictionResult:
    """Predict duration using linear regression."""

    if len(self.training_data) &#x3C; 3:
        return self._predict_baseline(target)

    # Filter by project type
    same_type = [p for p in self.training_data if p.project_type == target.project_type]

    if len(same_type) &#x3C; 3:
        same_type = self.training_data

    # Build feature matrix and target vector
    X = np.array([self._extract_features(p) for p in same_type])
    y = np.array([p.actual_duration for p in same_type])

    # Simple linear regression using normal equations
    X_with_intercept = np.column_stack([np.ones(len(X)), X])

    try:
        # beta = (X'X)^-1 X'y
        XtX = X_with_intercept.T @ X_with_intercept
        XtX_inv = np.linalg.inv(XtX)
        beta = XtX_inv @ X_with_intercept.T @ y
    except np.linalg.LinAlgError:
        return self._predict_baseline(target)

    # Predict
    target_features = self._extract_features(target)
    target_with_intercept = np.array([1] + list(target_features))
    predicted = int(target_features @ beta[1:] + beta[0])

    # Calculate residuals for confidence interval
    y_pred = X_with_intercept @ beta
    residuals = y - y_pred
    rmse = math.sqrt(np.mean(residuals ** 2))

    return PredictionResult(
        predicted_duration=max(1, predicted),
        confidence_interval=(max(1, int(predicted - 1.96 * rmse)),
                           int(predicted + 1.96 * rmse)),
        similar_projects=[p.project_id for p in same_type[:5]],
        model_used=ModelType.LINEAR_REGRESSION,
        features_importance=dict(zip(self.feature_weights.keys(),
                                    [abs(b) / sum(abs(beta[1:])) for b in beta[1:]]))
    )

def _predict_baseline(self, target: ProjectFeatures) -> PredictionResult:
    """Fall back to baseline prediction."""

    baseline = self.type_baseline_days.get(target.project_type,
                                            {'base': 250, 'per_1000sf': 0.4})

    predicted = int(baseline['base'] +
                   (target.size_sf / 1000) * baseline['per_1000sf'] * 30)

    # Adjustments
    if target.complexity > 3:
        predicted = int(predicted * (1 + (target.complexity - 3) * 0.1))
    if target.has_basement:
        predicted = int(predicted * 1.1)
    if target.is_renovation:
        predicted = int(predicted * 1.2)

    predicted = int(predicted * target.location_factor)

    return PredictionResult(
        predicted_duration=predicted,
        confidence_interval=(int(predicted * 0.8), int(predicted * 1.2)),
        similar_projects=[],
        model_used=ModelType.LINEAR_REGRESSION,
        features_importance=self.feature_weights
    )

def predict(self, target: ProjectFeatures,
            model: ModelType = ModelType.WEIGHTED_KNN,
            k: int = 5) -> PredictionResult:
    """Predict duration using specified model."""

    if model == ModelType.KNN:
        return self.predict_knn(target, k)
    elif model == ModelType.WEIGHTED_KNN:
        return self.predict_weighted_knn(target, k)
    elif model == ModelType.LINEAR_REGRESSION:
        return self.predict_regression(target)

    return self._predict_baseline(target)

def evaluate_model(self, test_data: List[ProjectFeatures],
                   model: ModelType = ModelType.WEIGHTED_KNN) -> Dict[str, float]:
    """Evaluate model performance."""

    actuals = []
    predictions = []

    for project in test_data:
        if project.actual_duration is None:
            continue

        result = self.predict(project, model)
        actuals.append(project.actual_duration)
        predictions.append(result.predicted_duration)

    if not actuals:
        return {}

    actuals = np.array(actuals)
    predictions = np.array(predictions)

    mae = np.mean(np.abs(actuals - predictions))
    mape = np.mean(np.abs((actuals - predictions) / actuals)) * 100
    rmse = math.sqrt(np.mean((actuals - predictions) ** 2))

    return {
        'mae': round(mae, 1),
        'mape': round(mape, 1),
        'rmse': round(rmse, 1),
        'samples': len(actuals)
    }

def get_similar_projects(self, target: ProjectFeatures, n: int = 10) -> pd.DataFrame:
    """Get most similar projects."""

    nearest = self._find_k_nearest(target, k=n, same_type=False)

    data = [{
        'Project ID': p.project_id,
        'Type': p.project_type.value,
        'Size (SF)': p.size_sf,
        'Floors': p.floors,
        'Complexity': p.complexity,
        'Duration (days)': p.actual_duration,
        'Distance': round(d, 3)
    } for p, d in nearest]

    return pd.DataFrame(data)

Quick Start

Create predictor

predictor = DurationPredictor()

Add training data

training_projects = [ ProjectFeatures("P001", ProjectType.OFFICE, 50000, 10, 3, 1.0, True, False, 365), ProjectFeatures("P002", ProjectType.OFFICE, 75000, 15, 4, 1.1, True, False, 450), ProjectFeatures("P003", ProjectType.OFFICE, 30000, 5, 2, 0.9, False, False, 280), ProjectFeatures("P004", ProjectType.OFFICE, 60000, 12, 3, 1.0, True, False, 390), ]

for p in training_projects: predictor.add_training_project(p)

Predict for new project

new_project = ProjectFeatures( project_id="NEW-001", project_type=ProjectType.OFFICE, size_sf=55000, floors=11, complexity=3, location_factor=1.0, has_basement=True, is_renovation=False )

result = predictor.predict(new_project, ModelType.WEIGHTED_KNN) print(f"Predicted duration: {result.predicted_duration} days") print(f"Confidence interval: {result.confidence_interval}") print(f"Similar projects: {result.similar_projects}")

Common Use Cases

Compare Models

knn_result = predictor.predict(new_project, ModelType.KNN) weighted_result = predictor.predict(new_project, ModelType.WEIGHTED_KNN) regression_result = predictor.predict(new_project, ModelType.LINEAR_REGRESSION)

print(f"k-NN: {knn_result.predicted_duration}") print(f"Weighted k-NN: {weighted_result.predicted_duration}") print(f"Regression: {regression_result.predicted_duration}")

Model Evaluation

metrics = predictor.evaluate_model(test_data, ModelType.WEIGHTED_KNN) print(f"MAE: {metrics['mae']} days") print(f"MAPE: {metrics['mape']}%")

Find Similar Projects

similar = predictor.get_similar_projects(new_project, n=5) print(similar)

Resources

DDC Book: Chapter 4.5 - Future: Predictions and Machine Learning
scikit-learn: https://scikit-learn.org/
Website: https://datadrivenconstruction.io

duration-prediction

Safety Notice

Copy this and send it to your AI assistant to learn

Create predictor

Add training data

Predict for new project

Source Transparency

Related Skills

cad-to-data

drawing-analyzer

dwg-to-excel

cost-estimation-resource