@@ -20,13 +20,101 @@ import kotlin.math.max
 import kotlin.math.min
 
 /**
- * YOLO ONNX-based implementation of MLInferenceEngine.
- * Preserves ALL functionality from the original YOLOOnnxDetector including:
- * - Complete 96-class mapping
- * - Multiple preprocessing techniques
- * - Coordinate transformation modes
- * - Weighted NMS and TTA
- * - Debug and testing features
+ * Coordinate transformation strategies for mapping YOLO output to screen coordinates.
+ *
+ * YOLO models typically output normalized coordinates (0.0-1.0) that need to be transformed
+ * back to the original image/screen coordinate system. Different transformation methods
+ * handle aspect ratio preservation and padding differently.
+ */
+sealed class CoordinateTransformMode(val description: String) {
+    /**
+     * Letterbox transformation with padding.
+     * Preserves aspect ratio by adding gray padding, then removes padding from coordinates.
+     * Best for: standard YOLO training with letterbox preprocessing.
+     */
+    object LETTERBOX : CoordinateTransformMode("Letterbox with padding removal")
+
+    /**
+     * Direct scaling without aspect ratio preservation.
+     * Maps coordinates directly from model space to original dimensions.
+     * Best for: models trained without letterbox preprocessing.
+     */
+    object DIRECT : CoordinateTransformMode("Direct coordinate scaling")
+
+    /**
+     * Hybrid approach combining letterbox logic with direct scaling benefits.
+     * Uses the letterbox calculation but applies direct coordinate mapping.
+     * Best for: Pokemon GO UI detection (recommended - highest accuracy).
+     */
+    object HYBRID : CoordinateTransformMode("Hybrid letterbox-direct approach")
+}
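+
+// Editor's illustration (not part of this change): dispatching on the sealed hierarchy with
+// an exhaustive `when`, so a newly added mode fails compilation until every caller handles it.
+// The function name `describeTransform` is hypothetical.
+private fun describeTransform(mode: CoordinateTransformMode): String = when (mode) {
+    CoordinateTransformMode.LETTERBOX -> "undo padding, then rescale (${mode.description})"
+    CoordinateTransformMode.DIRECT -> "rescale axes independently (${mode.description})"
+    CoordinateTransformMode.HYBRID -> "letterbox offsets with direct scaling (${mode.description})"
+}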
+
+/**
+ * Extension functions for OpenCV Mat operations to improve code readability and safety.
+ */
+private fun Mat.safeRelease() {
+    if (!this.empty()) {
+        this.release()
+    }
+}
+
+private inline fun Mat.useSafely(action: (Mat) -> Unit) {
+    try {
+        action(this)
+    } finally {
+        this.safeRelease()
+    }
+}
+private fun Mat.ensureBGRFormat(): Mat {
+    return when (this.type()) {
+        CvType.CV_8UC3 -> this
+        CvType.CV_8UC4 -> {
+            val converted = Mat()
+            Imgproc.cvtColor(this, converted, Imgproc.COLOR_BGRA2BGR)
+            converted
+        }
+        CvType.CV_8UC1 -> {
+            val converted = Mat()
+            Imgproc.cvtColor(this, converted, Imgproc.COLOR_GRAY2BGR)
+            converted
+        }
+        else -> {
+            val converted = Mat()
+            this.convertTo(converted, CvType.CV_8UC3)
+            converted
+        }
+    }
+}
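+
+// Editor's usage sketch (illustrative; `processScreenshot` is a hypothetical caller): convert
+// the capture to BGR, work on the result, and let useSafely() release whichever Mat
+// ensureBGRFormat() returned (possibly the original), even if the body throws.
+private fun processScreenshot(screenshot: Mat) {
+    screenshot.ensureBGRFormat().useSafely { bgr ->
+        // ...preprocess and run inference on bgr here...
+    }
+}
+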
+/**
+ * YOLO ONNX-based implementation of MLInferenceEngine for Pokemon GO UI detection.
+ *
+ * This implementation provides high-accuracy object detection for Pokemon GO UI elements
+ * including pokeballs, stats, shiny icons, and other game interface components.
+ *
+ * ## Architecture Features:
+ * - **Data-driven configuration**: Model metadata extracted from ONNX runtime
+ * - **Dynamic class mapping**: Class names loaded from training dataset.yaml
+ * - **Modern Kotlin patterns**: Sealed classes, extension functions, inline functions
+ * - **Centralized logging**: PGH-prefixed logs for easy filtering
+ * - **Type-safe error handling**: MLResult pattern with specific error types
+ * - **Resource management**: Automatic cleanup and memory leak prevention
+ *
+ * ## Detection Pipeline:
+ * 1. **Preprocessing**: Ultralytics-style normalization and resizing to model input size
+ * 2. **Inference**: ONNX Runtime execution with configurable timeouts
+ * 3. **Postprocessing**: NMS parsing, coordinate transformation, confidence mapping
+ * 4. **Filtering**: Class-based filtering and confidence thresholding
+ *
+ * ## Coordinate Transformation:
+ * Supports three transformation modes via a sealed class hierarchy:
+ * - `LETTERBOX`: Standard YOLO letterbox with padding removal
+ * - `DIRECT`: Direct scaling without aspect ratio preservation
+ * - `HYBRID`: Optimized approach for Pokemon GO UI (recommended)
+ *
+ * @param context Android context for asset access and ClassificationManager
+ * @param config YOLOConfig containing model parameters and thresholds
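+ *
+ * Construction sketch (editor's illustration; the constructor may take further parameters
+ * not visible in this excerpt):
+ * ```
+ * val engine = YOLOInferenceEngine(context, config)
+ * ```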
  */
 class YOLOInferenceEngine(
     private val context: Context,
@@ -45,8 +133,8 @@ class YOLOInferenceEngine(
         private const val ENABLE_TTA = true // Test-time augmentation
         private const val MAX_INFERENCE_TIME_MS = 4500L // Leave 500ms for other processing
 
-        // Coordinate transformation modes - HYBRID is the correct method
-        var COORD_TRANSFORM_MODE = "HYBRID" // HYBRID and LETTERBOX work correctly
+        // Coordinate transformation mode - HYBRID provides the best accuracy
+        var COORD_TRANSFORM_MODE: CoordinateTransformMode = CoordinateTransformMode.HYBRID
 
         // Class filtering for debugging
         var DEBUG_CLASS_FILTER: String? = null // Set to a class name to show only that class
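+        // e.g. DEBUG_CLASS_FILTER = "pokeball" (hypothetical class name) to show only that class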
@@ -81,10 +169,10 @@ class YOLOInferenceEngine(
         private const val MIN_DEBUG_CONFIDENCE = 0.1f
         private const val MAX_DEBUG_DETECTIONS_TO_LOG = 3
 
-        fun setCoordinateMode(mode: String)
+        fun setCoordinateMode(mode: CoordinateTransformMode)
         {
             COORD_TRANSFORM_MODE = mode
-            PGHLog.i(TAG, "🔧 Coordinate transform mode changed to: $mode")
+            PGHLog.i(TAG, "🔧 Coordinate transform mode changed to: ${mode::class.simpleName}")
         }
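+
+        // Editor's usage sketch (illustrative): callers switch strategies at runtime from the
+        // companion, e.g. YOLOInferenceEngine.setCoordinateMode(CoordinateTransformMode.DIRECT)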
 
         fun toggleShowAllConfidences()
@@ -675,7 +763,20 @@ class YOLOInferenceEngine(
     private data class TransformedCoordinates(val x1: Float, val y1: Float, val x2: Float, val y2: Float)
 
     /**
-     * Transform coordinates from model output to original image space
+     * Transform coordinates from model output to original image space.
+     *
+     * This is one of the most critical functions for detection accuracy. YOLO models output
+     * coordinates in normalized space (typically 0.0-1.0 or model input dimensions).
+     * We need to map these back to the original screen/image coordinates.
+     *
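+     * Worked letterbox example (editor's illustration, not from the source): a 1080x2400
+     * screen fitted into a 640x640 input uses scale = min(640/1080, 640/2400) = 0.2667,
+     * producing a 288x640 image with 176 px of horizontal padding per side; the inverse
+     * mapping is then x_orig = (x_model - 176) / 0.2667 and y_orig = y_model / 0.2667.
+     *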
+     * @param rawX1 Left coordinate from model output
+     * @param rawY1 Top coordinate from model output
+     * @param rawX2 Right coordinate from model output
+     * @param rawY2 Bottom coordinate from model output
+     * @param originalWidth Original image/screen width
+     * @param originalHeight Original image/screen height
+     * @param inputScale Model input size (e.g., 640 for a 640x640 model)
+     * @return Transformed coordinates in original image space
      */
     private fun transformCoordinates(
         rawX1: Float, rawY1: Float, rawX2: Float, rawY2: Float,
@@ -684,7 +785,7 @@ class YOLOInferenceEngine(
     {
         return when (COORD_TRANSFORM_MODE)
         {
-            "LETTERBOX" ->
+            CoordinateTransformMode.LETTERBOX ->
             {
                 val letterbox_params = calculateLetterboxInverse(originalWidth, originalHeight, inputScale)
                 val scale_x = letterbox_params[0]
@@ -699,7 +800,7 @@ class YOLOInferenceEngine(
                     y2 = (rawY2 - offset_y) * scale_y
                 )
             }
-            "DIRECT" ->
+            CoordinateTransformMode.DIRECT ->
             {
                 val direct_scale_x = originalWidth.toFloat() / inputScale.toFloat()
                 val direct_scale_y = originalHeight.toFloat() / inputScale.toFloat()
@@ -711,7 +812,7 @@ class YOLOInferenceEngine(
                     y2 = rawY2 * direct_scale_y
                 )
             }
-            "HYBRID" ->
+            CoordinateTransformMode.HYBRID ->
             {
                 val letterbox_params = calculateLetterboxInverse(originalWidth, originalHeight, inputScale)
                 val offset_x = letterbox_params[2]
@@ -754,7 +855,26 @@ class YOLOInferenceEngine(
     }
 
     /**
-     * Parse NMS (Non-Maximum Suppression) output format
+     * Parse NMS (Non-Maximum Suppression) output format.
+     *
+     * This function processes the post-processed output from a YOLO model that has already
+     * applied Non-Maximum Suppression. The typical NMS output format is
+     * [x1, y1, x2, y2, confidence, class_id] per detection.
+     *
+     * Key responsibilities:
+     * 1. Extract bounding box coordinates (x1, y1, x2, y2)
+     * 2. Extract confidence scores and apply thresholding
+     * 3. Extract class IDs and map them to human-readable names
+     * 4. Transform coordinates from model space to original image space
+     * 5. Apply confidence mapping for mobile ONNX optimization
+     * 6. Filter by debug class if specified
+     * 7. Validate and clamp coordinates to image boundaries
+     *
+     * @param output Flattened float array from the ONNX model output
+     * @param originalWidth Original image width for coordinate transformation
+     * @param originalHeight Original image height for coordinate transformation
+     * @param inputScale Model input size (e.g., 640 for a 640x640 model)
+     * @return List of Detection objects with transformed coordinates and metadata
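+     *
+     * Decoding sketch (editor's illustration of the 6-float stride only; the real method also
+     * transforms, maps, and filters as listed above):
+     * ```
+     * for (i in output.indices step 6) {
+     *     val x1 = output[i];     val y1 = output[i + 1]
+     *     val x2 = output[i + 2]; val y2 = output[i + 3]
+     *     val confidence = output[i + 4]
+     *     val classId = output[i + 5].toInt()
+     * }
+     * ```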
      */
     private fun parseNMSOutput(output: FloatArray, originalWidth: Int, originalHeight: Int, inputScale: Int): List<Detection>
     {