Edge Deployment
Deploying ML models to edge devices.
Edge Deployment Landscape
┌─────────────────────────────────────────────────────────────┐ │ EDGE DEPLOYMENT TARGETS │ ├─────────────────────────────────────────────────────────────┤ │ │ │ MOBILE EMBEDDED IOT/SENSORS │ │ ────── ──────── ─────────── │ │ iOS (Core ML) Raspberry Pi Arduino/ESP32 │ │ Android (TFLite) NVIDIA Jetson Microcontrollers │ │ React Native Google Coral FPGA boards │ │ │ │ CONSTRAINTS: │ │ ├── Memory: 256MB - 8GB │ │ ├── Compute: CPU/GPU/NPU │ │ ├── Power: Battery/USB/Wall │ │ ├── Connectivity: Always/Sometimes/Never │ │ └── Latency: 1ms - 100ms │ │ │ └─────────────────────────────────────────────────────────────┘
TensorFlow Lite
Model Conversion
import tensorflow as tf
Basic conversion
# Load the SavedModel directory and lower it to a TFLite FlatBuffer.
converter = tf.lite.TFLiteConverter.from_saved_model('saved_model/')
tflite_model = converter.convert()
With optimizations
# Default optimization set: enables post-training dynamic-range quantization.
converter.optimizations = [tf.lite.Optimize.DEFAULT]
Float16 quantization
# Restrict weights to float16 (halves model size; activations stay float32).
converter.target_spec.supported_types = [tf.float16]
Full integer quantization
def representative_dataset():
    """Yield calibration batches used to pick int8 quantization ranges.

    NOTE(review): assumes `calibration_data` iterates over numpy arrays
    matching the model's input shape -- confirm at the call site.
    """
    for sample in calibration_data:
        yield [sample.astype(np.float32)]
# Wire up the calibration source, then constrain the converter to the
# int8-only builtin kernel set with quantized model inputs/outputs.
converter.representative_dataset = representative_dataset
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8
converter.inference_output_type = tf.int8
Save model
# Serialize the converted FlatBuffer to disk.
with open('model.tflite', 'wb') as model_file:
    model_file.write(converter.convert())
TFLite Inference
import numpy as np import tensorflow as tf
Load model
# Load the FlatBuffer and reserve memory for every tensor up front.
interpreter = tf.lite.Interpreter(model_path='model.tflite')
interpreter.allocate_tensors()
Get input/output details
# Tensor metadata (index, shape, dtype) used to bind inputs and read outputs.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
Inference
def predict(input_data):
    """Run one synchronous inference pass; return the first output tensor.

    NOTE(review): `input_data` must already match the input tensor's
    shape and dtype -- confirm against `input_details[0]`.
    """
    interpreter.set_tensor(input_details[0]['index'], input_data)
    interpreter.invoke()
    return interpreter.get_tensor(output_details[0]['index'])
With delegate (Coral Edge TPU acceleration)
# Attach the Coral Edge TPU delegate so supported ops run on the accelerator;
# the model must have been compiled for the Edge TPU beforehand.
delegate = tf.lite.experimental.load_delegate('libedgetpu.so.1')
interpreter = tf.lite.Interpreter(
    model_path='model_edgetpu.tflite',
    experimental_delegates=[delegate],
)
Core ML (iOS)
PyTorch to Core ML
import coremltools as ct import torch
Export to Core ML
# Put the model in eval mode, then trace it with a dummy batch so
# TorchScript records a static graph for conversion.
model.eval()
example_input = torch.rand(1, 3, 224, 224)
traced_model = torch.jit.trace(model, example_input)
# TorchScript -> Core ML, with float16 weight precision and iOS 15 as the
# minimum deployment target.
mlmodel = ct.convert(
    traced_model,
    inputs=[ct.TensorType(shape=example_input.shape, name="image")],
    outputs=[ct.TensorType(name="predictions")],
    compute_precision=ct.precision.FLOAT16,
    minimum_deployment_target=ct.target.iOS15,
)
Add metadata
# Human-readable metadata surfaced in Xcode's model inspector.
mlmodel.author = "Your Name"
mlmodel.short_description = "Image classifier"
mlmodel.input_description["image"] = "Input image"
mlmodel.output_description["predictions"] = "Class probabilities"
# Write the model out in the ML Package container format.
mlmodel.save("Model.mlpackage")
Swift Integration
import CoreML import Vision
/// Wraps a Core ML classifier behind a Vision request for image inference.
final class ModelInference {
    let model: VNCoreMLModel

    /// Loads the compiled model with all compute units enabled.
    /// - Throws: Any Core ML model-loading or Vision-wrapping error.
    init() throws {
        let config = MLModelConfiguration()
        config.computeUnits = .all // Let Core ML pick CPU / GPU / Neural Engine.
        let mlModel = try MyModel(configuration: config)
        self.model = try VNCoreMLModel(for: mlModel.model)
    }

    /// Classifies `image` and delivers the top-5 labels with confidences.
    ///
    /// The completion handler is *always* invoked; any failure yields an
    /// empty dictionary so callers are never left waiting. (Previously a
    /// failed cast or a `try?`-swallowed perform error meant the handler
    /// was silently never called.)
    func predict(image: CGImage, completion: @escaping ([String: Double]) -> Void) {
        let request = VNCoreMLRequest(model: model) { request, error in
            guard error == nil,
                  let results = request.results as? [VNClassificationObservation] else {
                completion([:])
                return
            }
            let predictions = Dictionary(
                uniqueKeysWithValues: results.prefix(5).map { ($0.identifier, Double($0.confidence)) }
            )
            completion(predictions)
        }
        let handler = VNImageRequestHandler(cgImage: image)
        do {
            try handler.perform([request])
        } catch {
            completion([:]) // Report failure instead of swallowing it with try?.
        }
    }
}
NVIDIA Jetson
TensorRT Optimization
import tensorrt as trt
def build_engine(onnx_path, engine_path, precision='fp16'):
    """Build and serialize a TensorRT engine from an ONNX model.

    Args:
        onnx_path: Path to the source ONNX file.
        engine_path: Destination path for the serialized engine.
        precision: 'fp32' (default flags), 'fp16', or 'int8'
            (int8 requires calibration data).

    Returns:
        The built ICudaEngine.

    Raises:
        RuntimeError: If ONNX parsing or the engine build fails.
    """
    logger = trt.Logger(trt.Logger.INFO)
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, logger)

    # Parse ONNX. parser.parse returns False on failure, so the result must
    # be checked -- otherwise a bad model only surfaces as an opaque build
    # error much later.
    with open(onnx_path, 'rb') as f:
        if not parser.parse(f.read()):
            errors = [str(parser.get_error(i)) for i in range(parser.num_errors)]
            raise RuntimeError('ONNX parse failed: ' + '; '.join(errors))

    # Build config
    config = builder.create_builder_config()
    # 1GB scratch space. NOTE(review): max_workspace_size is deprecated in
    # TRT 8.4+ in favor of config.set_memory_pool_limit -- confirm TRT version.
    config.max_workspace_size = 1 << 30
    if precision == 'fp16':
        config.set_flag(trt.BuilderFlag.FP16)
    elif precision == 'int8':
        config.set_flag(trt.BuilderFlag.INT8)
        config.int8_calibrator = EntropyCalibrator(calibration_data)

    # Build engine; build_engine returns None on failure.
    engine = builder.build_engine(network, config)
    if engine is None:
        raise RuntimeError('TensorRT engine build failed')
    with open(engine_path, 'wb') as f:
        f.write(engine.serialize())
    return engine
Inference with TensorRT
class TRTInference:
    """Deserializes a TensorRT engine and holds an execution context."""

    # Constructor must be the dunder __init__ (the snippet's "def init"
    # would never be called by Python on instantiation).
    def __init__(self, engine_path):
        logger = trt.Logger(trt.Logger.WARNING)
        with open(engine_path, 'rb') as f:
            self.engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        self.context = self.engine.create_execution_context()

    def infer(self, input_data):
        """Run inference on `input_data` (not yet implemented)."""
        # Allocate buffers and run inference
        # TODO: allocate host/device buffers and call execute_v2.
        pass
DeepStream Pipeline
DeepStream config for video inference
config_infer_primary.txt
""" [property] gpu-id=0 net-scale-factor=0.0039215697906911373 model-file=resnet18.onnx model-engine-file=resnet18.engine labelfile-path=labels.txt batch-size=4 network-mode=2 # FP16 num-detected-classes=80 interval=0 gie-unique-id=1 process-mode=1 network-type=0 cluster-mode=2 maintain-aspect-ratio=1 symmetric-padding=1 """
Python pipeline
import gi
gi.require_version('Gst', '1.0')
from gi.repository import Gst

# Minimal playback pipeline: decode -> NVIDIA colorspace convert ->
# nvinfer (TensorRT inference) -> on-screen display -> EGL sink.
Gst.init(None)
pipeline = Gst.parse_launch("""
    filesrc location=video.mp4 ! decodebin ! nvvideoconvert !
    nvinfer config-file-path=config.txt !
    nvdsosd ! nvegltransform ! nveglglessink
""")
pipeline.set_state(Gst.State.PLAYING)
Microcontroller Deployment
TensorFlow Lite Micro
#include "tensorflow/lite/micro/all_ops_resolver.h"
#include "tensorflow/lite/micro/micro_interpreter.h"
#include "model_data.h"

// Static arena from which all tensors are carved -- no heap allocation.
// Sized by trial; grow if AllocateTensors fails.
constexpr int kTensorArenaSize = 10 * 1024;
uint8_t tensor_arena[kTensorArenaSize];

// One-time model setup followed by a single inference pass.
void setup() {
  // Map the flatbuffer that is compiled into the firmware image.
  const tflite::Model* model = tflite::GetModel(model_data);

  // Links every builtin op. NOTE(review): a MicroMutableOpResolver listing
  // only the ops this model uses would cut flash size -- confirm op set.
  tflite::AllOpsResolver resolver;

  // Interpreter allocates all tensors from the static arena.
  tflite::MicroInterpreter interpreter(
      model, resolver, tensor_arena, kTensorArenaSize);
  interpreter.AllocateTensors();

  TfLiteTensor* input = interpreter.input(0);

  // Fill input and invoke
  // input->data.f[0] = sensor_value;
  interpreter.Invoke();

  TfLiteTensor* output = interpreter.output(0);
  float prediction = output->data.f[0];
}
Model Optimization for Edge
def optimize_for_edge(model, target_size_mb=10, target_latency_ms=50):
    """Optimize model for edge deployment.

    Tries quantization, pruning, and distillation; keeps variants that fit
    the size budget, then ranks those meeting the latency budget by accuracy.
    """
    # Candidate (name, model) pairs that already satisfy the size budget.
    candidates = []

    # 1. Quantization
    quantized = quantize_dynamic(model, {nn.Linear, nn.Conv2d}, dtype=torch.qint8)
    if get_size_mb(quantized) <= target_size_mb:
        candidates.append(('quantization', quantized))

    # 2. Pruning
    pruned = prune_model(model, amount=0.5)
    if get_size_mb(pruned) <= target_size_mb:
        candidates.append(('pruning', pruned))

    # 3. Knowledge distillation
    student = create_smaller_model(model)
    distilled = distill(teacher=model, student=student)
    if get_size_mb(distilled) <= target_size_mb:
        candidates.append(('distillation', distilled))

    # Measure each candidate; keep those within the latency budget and
    # return them sorted best-accuracy-first.
    qualifying = []
    for method, candidate in candidates:
        latency = measure_latency(candidate)
        accuracy = evaluate(candidate)
        if latency <= target_latency_ms:
            qualifying.append({
                'method': method,
                'latency': latency,
                'accuracy': accuracy,
                'size_mb': get_size_mb(candidate),
            })
    return sorted(qualifying, key=lambda x: x['accuracy'], reverse=True)
Offline Inference
class OfflineInferenceManager:
    """Runs inference locally with optional cloud offload and deferred sync."""

    # Constructor must be the dunder __init__ ("def init" is an ordinary
    # method Python never calls on instantiation, leaving state unset).
    def __init__(self, model_path, cache_dir='./cache'):
        self.model = load_model(model_path)
        self.cache_dir = cache_dir
        # FIFO of {'input', 'result', 'timestamp'} dicts awaiting upload.
        self.pending_queue = []

    def predict(self, input_data, priority='normal'):
        """Run inference locally."""
        return self.model(input_data)

    def predict_with_fallback(self, input_data, cloud_endpoint=None):
        """Try cloud first, fall back to local."""
        try:
            if self._is_online() and cloud_endpoint:
                return self._cloud_predict(input_data, cloud_endpoint)
        except Exception:
            # Best-effort: any cloud failure degrades to local inference.
            pass
        return self.predict(input_data)

    def queue_for_sync(self, input_data, result):
        """Queue predictions for later sync."""
        self.pending_queue.append({
            'input': input_data,
            'result': result,
            'timestamp': time.time()
        })

    def sync_when_online(self, endpoint):
        """Sync queued predictions when connectivity restored.

        An item is re-queued if its upload raises, so a transient network
        error no longer drops data (pop-then-post lost the popped item).
        """
        while self.pending_queue and self._is_online():
            item = self.pending_queue.pop(0)
            try:
                requests.post(endpoint, json=item)
            except Exception:
                # Put it back at the head and stop; retry on the next sync.
                self.pending_queue.insert(0, item)
                break
Commands
-
/omgdeploy:edge
-
Edge deployment
-
/omgoptim:quantize
-
Quantization
-
/omgdeploy:package
-
Package for target
Best Practices
-
Profile on target device early
-
Use hardware-specific frameworks
-
Quantize to int8 when possible
-
Implement offline fallbacks
-
Monitor battery and thermal impact