
Edge Deployment

Deploying ML models to edge devices.

Edge Deployment Landscape

┌──────────────────────────────────────────────────────────────┐
│ EDGE DEPLOYMENT TARGETS                                      │
├──────────────────────────────────────────────────────────────┤
│                                                              │
│  MOBILE               EMBEDDED            IOT/SENSORS        │
│  ──────               ────────            ───────────        │
│  iOS (Core ML)        Raspberry Pi        Arduino/ESP32      │
│  Android (TFLite)     NVIDIA Jetson       Microcontrollers   │
│  React Native         Google Coral        FPGA boards        │
│                                                              │
│  CONSTRAINTS:                                                │
│  ├── Memory: 256MB - 8GB                                     │
│  ├── Compute: CPU/GPU/NPU                                    │
│  ├── Power: Battery/USB/Wall                                 │
│  ├── Connectivity: Always/Sometimes/Never                    │
│  └── Latency: 1ms - 100ms                                    │
│                                                              │
└──────────────────────────────────────────────────────────────┘
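
The constraints above are worth encoding explicitly so candidate models can be screened before any porting work. A minimal sketch, with illustrative field names and thresholds (not part of the original skill):

from dataclasses import dataclass

@dataclass
class DeviceProfile:
    name: str
    memory_mb: int            # RAM available for model + runtime
    latency_budget_ms: float
    always_online: bool

def fits(profile: DeviceProfile, model_size_mb: float, measured_latency_ms: float) -> bool:
    """Return True if the model fits the device's memory and latency budget."""
    return model_size_mb <= profile.memory_mb and measured_latency_ms <= profile.latency_budget_ms

# Example: a Raspberry Pi-class target
pi = DeviceProfile(name="raspberry-pi-4", memory_mb=2048, latency_budget_ms=100, always_online=False)
print(fits(pi, model_size_mb=12.5, measured_latency_ms=45.0))  # True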

TensorFlow Lite

Model Conversion

import numpy as np
import tensorflow as tf

Basic conversion

converter = tf.lite.TFLiteConverter.from_saved_model('saved_model/')
tflite_model = converter.convert()

With optimizations

converter.optimizations = [tf.lite.Optimize.DEFAULT]

Float16 quantization

converter.target_spec.supported_types = [tf.float16]

Full integer quantization

def representative_dataset():
    for data in calibration_data:
        yield [data.astype(np.float32)]

converter.representative_dataset = representative_dataset
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8
converter.inference_output_type = tf.int8

Save model

with open('model.tflite', 'wb') as f:
    f.write(converter.convert())

TFLite Inference

import numpy as np
import tensorflow as tf

Load model

interpreter = tf.lite.Interpreter(model_path='model.tflite')
interpreter.allocate_tensors()

Get input/output details

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

Inference

def predict(input_data):
    interpreter.set_tensor(input_details[0]['index'], input_data)
    interpreter.invoke()
    return interpreter.get_tensor(output_details[0]['index'])

With delegate (Edge TPU acceleration)

delegate = tf.lite.experimental.load_delegate('libedgetpu.so.1')
interpreter = tf.lite.Interpreter(
    model_path='model_edgetpu.tflite',
    experimental_delegates=[delegate]
)
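
Example call of the predict helper above, assuming the model expects a single float32 tensor of shape [1, 224, 224, 3] (adjust to your model's input details):

# Dummy input just to exercise the interpreter end to end
dummy = np.random.rand(1, 224, 224, 3).astype(np.float32)
print(predict(dummy))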

Core ML (iOS)

PyTorch to Core ML

import coremltools as ct
import torch

Export to Core ML

model.eval()
example_input = torch.rand(1, 3, 224, 224)
traced_model = torch.jit.trace(model, example_input)

mlmodel = ct.convert(
    traced_model,
    inputs=[ct.TensorType(shape=example_input.shape, name="image")],
    outputs=[ct.TensorType(name="predictions")],
    compute_precision=ct.precision.FLOAT16,
    minimum_deployment_target=ct.target.iOS15
)

Add metadata

mlmodel.author = "Your Name"
mlmodel.short_description = "Image classifier"
mlmodel.input_description["image"] = "Input image"
mlmodel.output_description["predictions"] = "Class probabilities"

mlmodel.save("Model.mlpackage")
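
On macOS the converted model can be smoke-tested from Python before it goes into an app; this sketch assumes the input/output names used in the ct.convert call above:

import numpy as np

sample = np.random.rand(1, 3, 224, 224).astype(np.float32)
out = mlmodel.predict({"image": sample})
print(out["predictions"].shape)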

Swift Integration

import CoreML
import Vision

class ModelInference {
    let model: VNCoreMLModel

    init() throws {
        let config = MLModelConfiguration()
        config.computeUnits = .all  // Use Neural Engine
        let mlModel = try MyModel(configuration: config)
        self.model = try VNCoreMLModel(for: mlModel.model)
    }

    func predict(image: CGImage, completion: @escaping ([String: Double]) -> Void) {
        let request = VNCoreMLRequest(model: model) { request, error in
            guard let results = request.results as? [VNClassificationObservation] else { return }
            let predictions = Dictionary(
                uniqueKeysWithValues: results.prefix(5).map { ($0.identifier, Double($0.confidence)) }
            )
            completion(predictions)
        }

        let handler = VNImageRequestHandler(cgImage: image)
        try? handler.perform([request])
    }
}

NVIDIA Jetson

TensorRT Optimization

import tensorrt as trt

def build_engine(onnx_path, engine_path, precision='fp16'):
    logger = trt.Logger(trt.Logger.INFO)
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, logger)

    # Parse ONNX
    with open(onnx_path, 'rb') as f:
        parser.parse(f.read())

    # Build config
    config = builder.create_builder_config()
    config.max_workspace_size = 1 << 30  # 1GB

    if precision == 'fp16':
        config.set_flag(trt.BuilderFlag.FP16)
    elif precision == 'int8':
        config.set_flag(trt.BuilderFlag.INT8)
        config.int8_calibrator = EntropyCalibrator(calibration_data)

    # Build engine
    engine = builder.build_engine(network, config)

    with open(engine_path, 'wb') as f:
        f.write(engine.serialize())

    return engine

Inference with TensorRT

class TRTInference:
    def __init__(self, engine_path):
        logger = trt.Logger(trt.Logger.WARNING)
        with open(engine_path, 'rb') as f:
            self.engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        self.context = self.engine.create_execution_context()

    def infer(self, input_data):
        # Allocate buffers and run inference (see the sketch below)
        pass
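
One way to fill in infer(), sketched with pycuda and assuming a static-shape engine with one input and one output binding (this uses the same pre-TensorRT-10 binding API as build_engine above):

import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates a CUDA context
import pycuda.driver as cuda
import tensorrt as trt

def infer(self, input_data):
    """Drop-in body for TRTInference.infer: copy input to GPU, execute, copy output back."""
    stream = cuda.Stream()
    bindings = []
    output_host = None
    output_dev = None
    for binding in self.engine:
        dtype = trt.nptype(self.engine.get_binding_dtype(binding))
        size = trt.volume(self.engine.get_binding_shape(binding))
        dev_mem = cuda.mem_alloc(size * np.dtype(dtype).itemsize)
        bindings.append(int(dev_mem))
        if self.engine.binding_is_input(binding):
            host_in = np.ascontiguousarray(input_data.astype(dtype))
            cuda.memcpy_htod_async(dev_mem, host_in, stream)
        else:
            output_host = np.empty(size, dtype=dtype)
            output_dev = dev_mem
    self.context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(output_host, output_dev, stream)
    stream.synchronize()
    return output_host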

DeepStream Pipeline

DeepStream config for video inference

config_infer_primary.txt

""" [property] gpu-id=0 net-scale-factor=0.0039215697906911373 model-file=resnet18.onnx model-engine-file=resnet18.engine labelfile-path=labels.txt batch-size=4 network-mode=2 # FP16 num-detected-classes=80 interval=0 gie-unique-id=1 process-mode=1 network-type=0 cluster-mode=2 maintain-aspect-ratio=1 symmetric-padding=1 """

Python pipeline

import gi
gi.require_version('Gst', '1.0')
from gi.repository import Gst

Gst.init(None)
pipeline = Gst.parse_launch("""
    filesrc location=video.mp4 ! decodebin ! nvvideoconvert !
    nvinfer config-file-path=config.txt ! nvdsosd !
    nvegltransform ! nveglglessink
""")
pipeline.set_state(Gst.State.PLAYING)
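
As written, a standalone script would exit right after set_state; running a GLib main loop keeps the pipeline processing frames (a minimal addition, assuming PyGObject as imported above):

from gi.repository import GLib

loop = GLib.MainLoop()
try:
    loop.run()          # blocks while the pipeline runs
finally:
    pipeline.set_state(Gst.State.NULL)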

Microcontroller Deployment

TensorFlow Lite Micro

#include "tensorflow/lite/micro/all_ops_resolver.h" #include "tensorflow/lite/micro/micro_interpreter.h" #include "model_data.h"

// Allocate tensor arena
constexpr int kTensorArenaSize = 10 * 1024;
uint8_t tensor_arena[kTensorArenaSize];

void setup() {
    // Set up model
    const tflite::Model* model = tflite::GetModel(model_data);

    // Set up resolver
    tflite::AllOpsResolver resolver;

    // Build interpreter
    tflite::MicroInterpreter interpreter(
        model, resolver, tensor_arena, kTensorArenaSize
    );
    interpreter.AllocateTensors();

    // Get input tensor
    TfLiteTensor* input = interpreter.input(0);

    // Fill input and invoke
    // input->data.f[0] = sensor_value;
    interpreter.Invoke();

    // Get output
    TfLiteTensor* output = interpreter.output(0);
    float prediction = output->data.f[0];
}
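
The model_data.h header is typically just the .tflite flatbuffer dumped as a C array (xxd -i model.tflite is the usual one-liner). A small Python sketch that produces an array named to match the include above; alignment attributes and const qualifiers may need adjusting for your toolchain:

# Convert model.tflite into a C header defining model_data[] / model_data_len
with open('model.tflite', 'rb') as f:
    data = f.read()

with open('model_data.h', 'w') as f:
    f.write('const unsigned char model_data[] = {\n  ')
    f.write(', '.join(str(b) for b in data))
    f.write('\n};\n')
    f.write(f'const unsigned int model_data_len = {len(data)};\n')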

Model Optimization for Edge

import torch
from torch import nn
from torch.quantization import quantize_dynamic

# prune_model, create_smaller_model, distill, measure_latency, evaluate and
# get_size_mb are assumed to be defined elsewhere in the project.

def optimize_for_edge(model, target_size_mb=10, target_latency_ms=50):
    """Optimize model for edge deployment."""
    optimizations = []

    # 1. Quantization
    quantized = quantize_dynamic(model, {nn.Linear, nn.Conv2d}, dtype=torch.qint8)
    if get_size_mb(quantized) <= target_size_mb:
        optimizations.append(('quantization', quantized))

    # 2. Pruning
    pruned = prune_model(model, amount=0.5)
    if get_size_mb(pruned) <= target_size_mb:
        optimizations.append(('pruning', pruned))

    # 3. Knowledge distillation
    student = create_smaller_model(model)
    distilled = distill(teacher=model, student=student)
    if get_size_mb(distilled) <= target_size_mb:
        optimizations.append(('distillation', distilled))

    # Evaluate each candidate against the latency budget
    results = []
    for name, opt_model in optimizations:
        latency = measure_latency(opt_model)
        accuracy = evaluate(opt_model)
        if latency <= target_latency_ms:
            results.append({
                'method': name,
                'latency': latency,
                'accuracy': accuracy,
                'size_mb': get_size_mb(opt_model)
            })

    return sorted(results, key=lambda x: x['accuracy'], reverse=True)
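
The helpers referenced above are project-specific; rough PyTorch-based sketches of two of them (get_size_mb and measure_latency) are shown here for illustration:

import io
import time
import torch

def get_size_mb(model: torch.nn.Module) -> float:
    """Serialized size of the model's weights in megabytes."""
    buffer = io.BytesIO()
    torch.save(model.state_dict(), buffer)
    return buffer.getbuffer().nbytes / (1024 * 1024)

def measure_latency(model: torch.nn.Module, input_shape=(1, 3, 224, 224), runs=50) -> float:
    """Median CPU latency in milliseconds for a dummy input."""
    model.eval()
    dummy = torch.randn(*input_shape)
    times = []
    with torch.no_grad():
        for _ in range(5):      # warm-up
            model(dummy)
        for _ in range(runs):
            start = time.perf_counter()
            model(dummy)
            times.append((time.perf_counter() - start) * 1000)
    times.sort()
    return times[len(times) // 2]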

Offline Inference

import time

import requests

# load_model is assumed to be defined elsewhere in the project.

class OfflineInferenceManager:
    def __init__(self, model_path, cache_dir='./cache'):
        self.model = load_model(model_path)
        self.cache_dir = cache_dir
        self.pending_queue = []

    def predict(self, input_data, priority='normal'):
        """Run inference locally."""
        return self.model(input_data)

    def predict_with_fallback(self, input_data, cloud_endpoint=None):
        """Try cloud first, fall back to local."""
        try:
            if self._is_online() and cloud_endpoint:
                return self._cloud_predict(input_data, cloud_endpoint)
        except Exception:
            pass

        return self.predict(input_data)

    def queue_for_sync(self, input_data, result):
        """Queue predictions for later sync."""
        self.pending_queue.append({
            'input': input_data,
            'result': result,
            'timestamp': time.time()
        })

    def sync_when_online(self, endpoint):
        """Sync queued predictions when connectivity is restored."""
        while self.pending_queue and self._is_online():
            item = self.pending_queue.pop(0)
            requests.post(endpoint, json=item)
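
The two private helpers used above are not shown in the original; possible implementations that would sit on the class are sketched here (the probe URL and response format are placeholders):

import requests

def _is_online(self, probe_url='https://example.com/health', timeout=2):
    """Cheap connectivity probe; treat any network error as offline."""
    try:
        requests.head(probe_url, timeout=timeout)
        return True
    except requests.RequestException:
        return False

def _cloud_predict(self, input_data, endpoint, timeout=5):
    """Send the input to a cloud inference endpoint and return its prediction."""
    response = requests.post(endpoint, json={'input': input_data}, timeout=timeout)
    response.raise_for_status()
    return response.json()['prediction']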

Commands

  • /omgdeploy:edge - Edge deployment

  • /omgoptim:quantize - Quantization

  • /omgdeploy:package - Package for target

Best Practices

  • Profile on target device early (see the profiling sketch after this list)

  • Use hardware-specific frameworks

  • Quantize to int8 when possible

  • Implement offline fallbacks

  • Monitor battery and thermal impact
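
For the first point, a rough on-device latency profile for a TFLite model might look like this; the model path, thread count, and run counts are illustrative:

import time
import numpy as np
import tensorflow as tf

interpreter = tf.lite.Interpreter(model_path='model.tflite', num_threads=2)
interpreter.allocate_tensors()
inp = interpreter.get_input_details()[0]
dummy = np.zeros(inp['shape'], dtype=inp['dtype'])

# Warm up, then time repeated invocations
for _ in range(10):
    interpreter.set_tensor(inp['index'], dummy)
    interpreter.invoke()

times = []
for _ in range(100):
    start = time.perf_counter()
    interpreter.set_tensor(inp['index'], dummy)
    interpreter.invoke()
    times.append((time.perf_counter() - start) * 1000)

print(f"p50={np.percentile(times, 50):.1f} ms  p95={np.percentile(times, 95):.1f} ms")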
