diff --git a/app/src/main/java/com/quillstudios/pokegoalshelper/ml/YOLOInferenceEngine.kt b/app/src/main/java/com/quillstudios/pokegoalshelper/ml/YOLOInferenceEngine.kt
index ab54ca4..5f1e118 100644
--- a/app/src/main/java/com/quillstudios/pokegoalshelper/ml/YOLOInferenceEngine.kt
+++ b/app/src/main/java/com/quillstudios/pokegoalshelper/ml/YOLOInferenceEngine.kt
@@ -20,13 +20,101 @@ import kotlin.math.max
 import kotlin.math.min
 
 /**
- * YOLO ONNX-based implementation of MLInferenceEngine.
- * Preserves ALL functionality from the original YOLOOnnxDetector including:
- * - Complete 96-class mapping
- * - Multiple preprocessing techniques
- * - Coordinate transformation modes
- * - Weighted NMS and TTA
- * - Debug and testing features
+ * Coordinate transformation strategies for mapping YOLO output to screen coordinates.
+ *
+ * YOLO models typically output normalized coordinates (0.0-1.0) that need to be transformed
+ * back to the original image/screen coordinate system. Different transformation methods
+ * handle aspect ratio preservation and padding differently.
+ */
+sealed class CoordinateTransformMode(val description: String) {
+    /**
+     * Letterbox transformation with padding.
+     * Preserves aspect ratio by adding gray padding, then removes padding from coordinates.
+     * Best for: Standard YOLO training with letterbox preprocessing.
+     */
+    object LETTERBOX : CoordinateTransformMode("Letterbox with padding removal")
+
+    /**
+     * Direct scaling without aspect ratio preservation.
+     * Maps coordinates directly from model space to original dimensions.
+     * Best for: Models trained without letterbox preprocessing.
+     */
+    object DIRECT : CoordinateTransformMode("Direct coordinate scaling")
+
+    /**
+     * Hybrid approach combining letterbox logic with direct scaling benefits.
+     * Uses letterbox calculation but applies direct coordinate mapping.
+     * Best for: Pokemon GO UI detection (recommended - highest accuracy).
+     */
+    object HYBRID : CoordinateTransformMode("Hybrid letterbox-direct approach")
+}
+
+/**
+ * Extension functions for OpenCV Mat operations to improve code readability and safety.
+ */
+private fun Mat.safeRelease() {
+    if (!this.empty()) {
+        this.release()
+    }
+}
+
+private inline fun Mat.useSafely(action: (Mat) -> Unit) {
+    try {
+        action(this)
+    } finally {
+        this.safeRelease()
+    }
+}
+
+private fun Mat.ensureBGRFormat(): Mat {
+    return when (this.type()) {
+        CvType.CV_8UC3 -> this
+        CvType.CV_8UC4 -> {
+            val converted = Mat()
+            Imgproc.cvtColor(this, converted, Imgproc.COLOR_BGRA2BGR)
+            converted
+        }
+        CvType.CV_8UC1 -> {
+            val converted = Mat()
+            Imgproc.cvtColor(this, converted, Imgproc.COLOR_GRAY2BGR)
+            converted
+        }
+        else -> {
+            // Fallback: convertTo() changes depth but keeps the channel count, so
+            // unusual inputs may still need an explicit cvtColor() by the caller.
+            val converted = Mat()
+            this.convertTo(converted, CvType.CV_8UC3)
+            converted
+        }
+    }
+}
+
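Taken together, the helpers above support a call-site pattern like the following minimal sketch. It assumes code living in the same file (the extensions are `private`); `prepareFrame` itself is illustrative and not part of this change. It deliberately avoids `useSafely` on the conversion result, because `ensureBGRFormat` returns the receiver itself when the input is already `CV_8UC3`, and releasing the caller's Mat would be a bug:

```kotlin
import org.opencv.core.Mat
import org.opencv.core.Size
import org.opencv.imgproc.Imgproc

// Hypothetical call site: normalize an incoming frame to BGR and hand a resized
// copy to the model, releasing the intermediate conversion (if any) afterwards.
fun prepareFrame(raw: Mat, inputScale: Int): Mat {
    val resized = Mat()
    val bgr = raw.ensureBGRFormat()
    try {
        // Resize to the square model input (e.g., 640x640).
        Imgproc.resize(bgr, resized, Size(inputScale.toDouble(), inputScale.toDouble()))
    } finally {
        // ensureBGRFormat returns the input unchanged when it is already CV_8UC3,
        // so only release the conversion if one was actually made.
        if (bgr !== raw) bgr.safeRelease()
    }
    return resized
}
```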
+/**
+ * YOLO ONNX-based implementation of MLInferenceEngine for Pokemon GO UI detection.
+ *
+ * This implementation provides high-accuracy object detection for Pokemon GO UI elements
+ * including pokeballs, stats, shiny icons, and other game interface components.
+ *
+ * ## Architecture Features:
+ * - **Data-driven configuration**: Model metadata extracted from ONNX runtime
+ * - **Dynamic class mapping**: Class names loaded from training dataset.yaml
+ * - **Modern Kotlin patterns**: Sealed classes, extension functions, inline functions
+ * - **Centralized logging**: PGH-prefixed logs for easy filtering
+ * - **Type-safe error handling**: MLResult pattern with specific error types
+ * - **Resource management**: Automatic cleanup and memory leak prevention
+ *
+ * ## Detection Pipeline:
+ * 1. **Preprocessing**: Ultralytics-style normalization and resizing to the model input size
+ * 2. **Inference**: ONNX Runtime execution with configurable timeouts
+ * 3. **Postprocessing**: NMS parsing, coordinate transformation, confidence mapping
+ * 4. **Filtering**: Class-based filtering and confidence thresholding
+ *
+ * ## Coordinate Transformation:
+ * Supports three transformation modes via a sealed class hierarchy:
+ * - `LETTERBOX`: Standard YOLO letterbox with padding removal
+ * - `DIRECT`: Direct scaling without aspect ratio preservation
+ * - `HYBRID`: Optimized approach for Pokemon GO UI (recommended)
+ *
+ * @param context Android context for asset access and ClassificationManager
+ * @param config YOLOConfig containing model parameters and thresholds
  */
 class YOLOInferenceEngine(
     private val context: Context,
@@ -45,8 +133,8 @@ class YOLOInferenceEngine(
         private const val ENABLE_TTA = true // Test-time augmentation
         private const val MAX_INFERENCE_TIME_MS = 4500L // Leave 500ms for other processing
 
-        // Coordinate transformation modes - HYBRID is the correct method
-        var COORD_TRANSFORM_MODE = "HYBRID" // HYBRID and LETTERBOX work correctly
+        // Coordinate transformation mode - HYBRID provides best accuracy
+        var COORD_TRANSFORM_MODE: CoordinateTransformMode = CoordinateTransformMode.HYBRID
 
         // Class filtering for debugging
         var DEBUG_CLASS_FILTER: String? = null // Set to class name to show only that class
@@ -81,10 +169,10 @@ class YOLOInferenceEngine(
         private const val MIN_DEBUG_CONFIDENCE = 0.1f
        private const val MAX_DEBUG_DETECTIONS_TO_LOG = 3
 
-        fun setCoordinateMode(mode: String)
+        fun setCoordinateMode(mode: CoordinateTransformMode)
         {
            COORD_TRANSFORM_MODE = mode
-            PGHLog.i(TAG, "🔧 Coordinate transform mode changed to: $mode")
+            PGHLog.i(TAG, "🔧 Coordinate transform mode changed to: ${mode::class.simpleName}")
         }
 
         fun toggleShowAllConfidences()
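The switch from a string-typed mode to the sealed hierarchy is what makes `setCoordinateMode` type-safe: a caller can no longer pass a misspelled mode name, and `when` expressions over the mode become exhaustive. A minimal sketch of that benefit (the `describeMode` helper is hypothetical, not part of the change):

```kotlin
// Hypothetical helper: because CoordinateTransformMode is sealed, the compiler
// enforces that every mode object is handled; no else branch is needed, and
// adding a new mode becomes a compile error until this when is updated.
fun describeMode(mode: CoordinateTransformMode): String = when (mode) {
    CoordinateTransformMode.LETTERBOX -> "letterbox: ${mode.description}"
    CoordinateTransformMode.DIRECT -> "direct: ${mode.description}"
    CoordinateTransformMode.HYBRID -> "hybrid: ${mode.description}"
}
```

Under the old `String`-based constant, a typo such as `"LETERBOX"` compiled cleanly and silently fell through at runtime; with the sealed class that entire failure mode is impossible.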
@@ -675,7 +763,20 @@ class YOLOInferenceEngine(
     private data class TransformedCoordinates(val x1: Float, val y1: Float, val x2: Float, val y2: Float)
 
     /**
-     * Transform coordinates from model output to original image space
+     * Transform coordinates from model output to original image space.
+     *
+     * This is one of the most critical functions for detection accuracy. YOLO models output
+     * coordinates in normalized space (typically 0.0-1.0 or model input dimensions).
+     * We need to map these back to the original screen/image coordinates.
+     *
+     * @param rawX1 Left coordinate from model output
+     * @param rawY1 Top coordinate from model output
+     * @param rawX2 Right coordinate from model output
+     * @param rawY2 Bottom coordinate from model output
+     * @param originalWidth Original image/screen width
+     * @param originalHeight Original image/screen height
+     * @param inputScale Model input size (e.g., 640 for 640x640 model)
+     * @return Transformed coordinates in original image space
      */
     private fun transformCoordinates(
         rawX1: Float, rawY1: Float, rawX2: Float, rawY2: Float,
@@ -684,7 +785,7 @@
     {
         return when (COORD_TRANSFORM_MODE) {
-            "LETTERBOX" ->
+            CoordinateTransformMode.LETTERBOX ->
             {
                 val letterbox_params = calculateLetterboxInverse(originalWidth, originalHeight, inputScale)
                 val scale_x = letterbox_params[0]
@@ -699,7 +800,7 @@
                     y2 = (rawY2 - offset_y) * scale_y
                 )
             }
-            "DIRECT" ->
+            CoordinateTransformMode.DIRECT ->
             {
                 val direct_scale_x = originalWidth.toFloat() / inputScale.toFloat()
                 val direct_scale_y = originalHeight.toFloat() / inputScale.toFloat()
@@ -711,7 +812,7 @@
                     y2 = rawY2 * direct_scale_y
                 )
             }
-            "HYBRID" ->
+            CoordinateTransformMode.HYBRID ->
             {
                 val letterbox_params = calculateLetterboxInverse(originalWidth, originalHeight, inputScale)
                 val offset_x = letterbox_params[2]
@@ -754,7 +855,26 @@
     }
 
     /**
-     * Parse NMS (Non-Maximum Suppression) output format
+     * Parse NMS (Non-Maximum Suppression) output format.
+     *
+     * This function processes the post-processed output from a YOLO model that has already
+     * applied Non-Maximum Suppression. The typical NMS output format is:
+     * [x1, y1, x2, y2, confidence, class_id] per detection.
+     *
+     * Key responsibilities:
+     * 1. Extract bounding box coordinates (x1, y1, x2, y2)
+     * 2. Extract confidence scores and apply thresholding
+     * 3. Extract class IDs and map to human-readable names
+     * 4. Transform coordinates from model space to original image space
+     * 5. Apply confidence mapping for mobile ONNX optimization
+     * 6. Filter by debug class if specified
+     * 7. Validate and clamp coordinates to image boundaries
+     *
+     * @param output Flattened float array from ONNX model output
+     * @param originalWidth Original image width for coordinate transformation
+     * @param originalHeight Original image height for coordinate transformation
+     * @param inputScale Model input size (e.g., 640 for 640x640 model)
+     * @return List of Detection objects with transformed coordinates and metadata
      */
     private fun parseNMSOutput(output: FloatArray, originalWidth: Int, originalHeight: Int, inputScale: Int): List<Detection>
     {
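For reference, the stride-6 layout documented above can be consumed with a loop like the following minimal sketch. `Box`, `parseStride6`, the 0.25 default threshold, and the DIRECT-style rescale are illustrative assumptions; they stand in for the engine's actual `Detection` type, configured thresholds, and mode-aware coordinate transform:

```kotlin
// Minimal sketch of stride-6 NMS parsing, assuming [x1, y1, x2, y2, conf, classId]
// per detection and a simple DIRECT-style rescale. The production code above also
// applies the selected CoordinateTransformMode, confidence mapping, class-name
// lookup, and debug filtering.
data class Box(val x1: Float, val y1: Float, val x2: Float, val y2: Float,
               val confidence: Float, val classId: Int)

fun parseStride6(output: FloatArray, origW: Int, origH: Int, inputScale: Int,
                 confThreshold: Float = 0.25f): List<Box> {
    val sx = origW.toFloat() / inputScale
    val sy = origH.toFloat() / inputScale
    val boxes = mutableListOf<Box>()
    var i = 0
    while (i + 5 < output.size) {
        val conf = output[i + 4]
        if (conf >= confThreshold) {
            boxes += Box(
                // Scale to original image space and clamp to image boundaries.
                x1 = (output[i] * sx).coerceIn(0f, origW.toFloat()),
                y1 = (output[i + 1] * sy).coerceIn(0f, origH.toFloat()),
                x2 = (output[i + 2] * sx).coerceIn(0f, origW.toFloat()),
                y2 = (output[i + 3] * sy).coerceIn(0f, origH.toFloat()),
                confidence = conf,
                classId = output[i + 5].toInt()
            )
        }
        i += 6 // Advance to the next [x1, y1, x2, y2, conf, classId] record.
    }
    return boxes
}
```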