"""Window capture, image processing, and OCR utilities.
This module provides :class:`~autocv.core.vision.Vision`, which extends
:class:`~autocv.core.window_capture.WindowCapture` with a persistent image
backbuffer (``opencv_image``) and common operations needed by AutoCV:
- capturing frames into the backbuffer,
- extracting text via PaddleOCR,
- sampling, filtering, and analyzing colors,
- basic morphology, template matching, and contour helpers.
All image operations use OpenCV's BGR channel order unless explicitly stated.
Public color inputs/outputs use RGB tuples for consistency across the library.
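Example (a minimal sketch; ``hwnd`` is assumed to be a valid window handle obtained elsewhere)::
from autocv.core.vision import Vision
vision = Vision(hwnd=hwnd)
vision.refresh()  # capture the target window into the BGR backbuffer
pixel = vision.get_color((10, 10))  # (R, G, B) at x=10, y=10
entries = vision.get_text(confidence=0.7)  # OCR entries with text, rect, and confidence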
"""
from __future__ import annotations
__all__ = ("Vision",)
import os
import warnings
from dataclasses import dataclass
from typing import TYPE_CHECKING, Final, Literal, TypedDict, cast
import cv2 as cv
import numpy as np
import numpy.typing as npt
from PIL import Image
from typing_extensions import Self
from paddleocr import PaddleOCR
from .decorators import check_valid_hwnd, check_valid_image
from .image_processing import filter_colors
from .window_capture import WindowCapture
if TYPE_CHECKING:
from collections.abc import Sequence
Color = tuple[int, int, int] # RGB channel order
Point = tuple[int, int]
Rect = tuple[int, int, int, int] # x, y, width, height
Contour = npt.NDArray[np.int32]
MaskArray = npt.NDArray[np.uint8]
NDArrayUint8 = npt.NDArray[np.uint8]
NDArrayInt16 = npt.NDArray[np.int16]
class OcrTextEntry(TypedDict):
"""Structured OCR output returned by :meth:`Vision.get_text`.
The ``rect`` field is ``[left, top, width, height]`` in backbuffer coordinates.
"""
text: str
rect: list[int]
confidence: float
RGBA_CHANNELS: Final[int] = 4
COLOR_IMAGE_NDIMS: Final[int] = 3
RGB_CHANNELS: Final[int] = 3
MAX_COLOR_VALUE: Final[int] = 255
SpeedPreset = Literal["fast", "balanced", "accurate"]
_OCR_CPU_THREADS: Final[int] = 8
_OCR_UPSCALE_FACTOR: Final[float] = 2.0
_OCR_BOX_LENGTH: Final[int] = 4
_OCR_BILATERAL_DIAMETER: Final[int] = 5
_OCR_BILATERAL_SIGMA_COLOR: Final[int] = 50
_OCR_BILATERAL_SIGMA_SPACE: Final[int] = 50
_GROUP_RECTANGLES_EPS: Final[float] = 0.1
_GROUP_RECTANGLES_THRESHOLD: Final[int] = 1
_DEFAULT_MORPH_KERNEL_SIZE: Final[int] = 3
@dataclass(frozen=True, slots=True)
class _OcrDetectionPreset:
"""PaddleOCR detection tuning for a speed preset."""
det_side_len: int
det_box_thresh: float
det_unclip_ratio: float
_OCR_PRESETS: Final[dict[str, _OcrDetectionPreset]] = {
"fast": _OcrDetectionPreset(det_side_len=640, det_box_thresh=0.55, det_unclip_ratio=1.6),
"balanced": _OcrDetectionPreset(det_side_len=960, det_box_thresh=0.50, det_unclip_ratio=1.8),
"accurate": _OcrDetectionPreset(det_side_len=1280, det_box_thresh=0.45, det_unclip_ratio=1.9),
}
class Vision(WindowCapture):
"""Capture windows, process images, and perform OCR.
The class maintains a persistent OpenCV-compatible backbuffer in
:attr:`opencv_image`. Most image processing routines operate on this buffer.
Notes:
- Frames returned by :meth:`refresh` and stored in :attr:`opencv_image`
are in BGR channel order.
- Public color values are expressed as RGB tuples.
"""
def __init__(
self: Self,
hwnd: int = -1,
lang: str = "en",
device: str | None = None,
conf_threshold: float = 0.60,
speed: SpeedPreset = "balanced",
*,
disable_model_source_check: bool = False,
) -> None:
"""Initialise a Vision object.
Args:
hwnd: Window handle of the target window. Defaults to -1.
lang: PaddleOCR language code.
device: PaddleOCR device override (e.g. ``"cpu"`` / ``"gpu"``); ``None`` uses PaddleOCR defaults.
conf_threshold: OCR recognition confidence threshold between 0 and 1.
speed: Preset that tunes detection settings.
disable_model_source_check: When ``True``, disables PaddleOCR/PaddleX model host connectivity checks
via the ``PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK`` environment variable.
"""
super().__init__(hwnd)
self.opencv_image: NDArrayUint8 = np.empty(0, dtype=np.uint8)
self.api: PaddleOCR | None = None
if disable_model_source_check:
os.environ.setdefault("PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK", "True")
preset = _OCR_PRESETS.get(speed, _OCR_PRESETS["balanced"])
self._ocr_lang: str = lang
self._ocr_device: str | None = device
self._ocr_conf_threshold: float = float(conf_threshold)
self._ocr_det_side_len: int = int(preset.det_side_len)
self._ocr_det_box_thresh: float = float(preset.det_box_thresh)
self._ocr_det_unclip_ratio: float = float(preset.det_unclip_ratio)
def _ensure_ocr(self: Self) -> PaddleOCR:
"""Return an initialised PaddleOCR instance, creating one on demand."""
if self.api is not None:
return self.api
try:
self.api = PaddleOCR(
# Core selection
lang=self._ocr_lang,
ocr_version="PP-OCRv5",
# OSRS/game UI: disable doc features
use_doc_orientation_classify=False,
use_doc_unwarping=False,
use_textline_orientation=False,
# Detection tuning
text_det_limit_type="max",
text_det_limit_side_len=self._ocr_det_side_len,
text_det_box_thresh=self._ocr_det_box_thresh,
text_det_unclip_ratio=self._ocr_det_unclip_ratio,
# Recognition filtering (TTS-friendly)
text_rec_score_thresh=self._ocr_conf_threshold,
# Runtime
device=self._ocr_device,
cpu_threads=_OCR_CPU_THREADS,
)
except ModuleNotFoundError as exc:
if exc.name != "paddle":
raise
msg = "PaddleOCR requires PaddlePaddle. Install `autocv[paddle-cpu]` or `autocv[paddle-gpu]`."
raise RuntimeError(msg) from exc
return self.api
@staticmethod
def _pil_to_bgr(image: Image.Image) -> NDArrayUint8:
"""Convert a PIL image to an OpenCV-compatible BGR array."""
rgb = np.array(image.convert("RGB"), dtype=np.uint8)
return cast("NDArrayUint8", cv.cvtColor(rgb, cv.COLOR_RGB2BGR))
def set_backbuffer(self: Self, image: NDArrayUint8 | Image.Image) -> None:
"""Set the image buffer to the provided NumPy array or PIL Image.
Args:
image: Image data used to refresh the backbuffer. NumPy arrays are stored as-is and are expected in BGR order; PIL images are converted to BGR.
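Example (a sketch; the file name is illustrative)::
from PIL import Image
vision.set_backbuffer(Image.open("screenshot.png"))  # PIL input is converted to BGR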
"""
if isinstance(image, Image.Image):
self.opencv_image = self._pil_to_bgr(image)
else:
self.opencv_image = image
@check_valid_hwnd
def refresh(self: Self, *, set_backbuffer: bool = True) -> NDArrayUint8 | None:
"""Capture the current window image and optionally persist it to ``opencv_image``.
Args:
set_backbuffer: When ``True``, persist the capture to ``self.opencv_image``;
otherwise return the captured frame.
Raises:
InvalidHandleError: Raised when ``self.hwnd`` is not a valid window handle.
Returns:
Captured frame when ``set_backbuffer`` is ``False``; ``None`` otherwise.
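Example (a sketch; assumes ``self.hwnd`` already targets a valid window)::
vision.refresh()  # persists the capture to ``opencv_image`` and returns ``None``
frame = vision.refresh(set_backbuffer=False)  # returns the BGR frame; backbuffer untouched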
"""
frame = self.capture_frame(persist=False)
if set_backbuffer:
self.set_backbuffer(frame)
return None
return frame
@check_valid_image
def save_backbuffer_to_file(self: Self, file_name: str) -> None:
"""Save the backbuffer image to a file.
Args:
file_name: Path where the backbuffer snapshot is stored.
"""
cv.imwrite(file_name, self.opencv_image)
@check_valid_hwnd
@check_valid_image
def get_pixel_change(self: Self, area: Rect | None = None) -> int:
"""Calculate how many pixels changed between current and refreshed frames.
Args:
area: Region of interest expressed as (x, y, width, height); ``None``
inspects the full frame.
Raises:
InvalidHandleError: Raised when ``self.hwnd`` is not a valid window handle.
InvalidImageError: Raised when the capture buffer is empty.
Returns:
int: Count of pixels with different intensities between frames.
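Example (a sketch; the region is illustrative)::
changed = vision.get_pixel_change(area=(0, 0, 100, 100))  # pixels that differ between captures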
"""
current_region = self._crop_image(area, self.opencv_image)
current_gray = cv.cvtColor(current_region, cv.COLOR_BGR2GRAY)
updated_frame = self.capture_frame(persist=False)
updated_region = self._crop_image(area, updated_frame)
updated_gray = cv.cvtColor(updated_region, cv.COLOR_BGR2GRAY)
diff = cv.absdiff(current_gray, updated_gray)
return int(np.count_nonzero(diff))
def _crop_image(
self: Self,
rect: Rect | None = None,
image: NDArrayUint8 | None = None,
) -> NDArrayUint8:
"""Crop the current image or a provided image to the specified rectangle.
Args:
rect: Region to crop in ``(x, y, width, height)`` form.
image: Explicit image to operate on; defaults to ``self.opencv_image``.
Returns:
Cropped slice of the source image.
Raises:
ValueError: If ``rect`` has non-positive dimensions, negative coordinates, or lies outside the image.
"""
image = image if image is not None else self.opencv_image
if rect is None:
return image
x, y, w, h = rect
if w <= 0 or h <= 0:
msg = "Crop rectangle must have positive width and height."
raise ValueError(msg)
if x < 0 or y < 0:
msg = "Crop rectangle coordinates must be non-negative."
raise ValueError(msg)
height, width = image.shape[:2]
if x >= width or y >= height:
msg = "Crop rectangle lies outside the image bounds."
raise ValueError(msg)
right = min(x + w, width)
bottom = min(y + h, height)
if right <= x or bottom <= y:
msg = "Crop rectangle lies outside the image bounds."
raise ValueError(msg)
return image[y:bottom, x:right]
@staticmethod
def _poly_to_bbox(poly: object) -> tuple[int, int, int, int]:
"""Convert a PaddleOCR polygon into an ``(x_min, y_min, x_max, y_max)`` box."""
points = np.asarray(poly, dtype=np.int32)
xs = points[:, 0]
ys = points[:, 1]
return int(xs.min()), int(ys.min()), int(xs.max()), int(ys.max())
@staticmethod
def _box_to_bbox(box: Sequence[float | int]) -> tuple[int, int, int, int]:
"""Convert a PaddleOCR box into integer ``(x_min, y_min, x_max, y_max)`` coordinates."""
if len(box) != _OCR_BOX_LENGTH:
msg = "OCR box must have four elements (x_min, y_min, x_max, y_max)."
raise ValueError(msg)
x_min, y_min, x_max, y_max = box
return int(x_min), int(y_min), int(x_max), int(y_max)
@staticmethod
def _extract_ocr_payload(prediction: object) -> dict[str, object] | None:
"""Normalize PaddleOCR predictions into a single JSON-like dictionary."""
payload: object = prediction
json_payload: object | None = getattr(prediction, "json", None)
if json_payload is not None:
payload = json_payload
if isinstance(payload, list):
payload_list = cast("list[object]", payload)
if not payload_list:
return None
payload = payload_list[0]
return cast("dict[str, object]", payload) if isinstance(payload, dict) else None
@staticmethod
def _prepare_ocr_input(image: NDArrayUint8) -> NDArrayUint8:
"""Preprocess an image for PaddleOCR inference."""
if image.ndim == COLOR_IMAGE_NDIMS and image.shape[-1] == RGB_CHANNELS:
gray: NDArrayUint8 = cast("NDArrayUint8", cv.cvtColor(image, cv.COLOR_BGR2GRAY))
else:
gray = image
gray = cast(
"NDArrayUint8",
cv.resize(
gray,
None,
fx=_OCR_UPSCALE_FACTOR,
fy=_OCR_UPSCALE_FACTOR,
interpolation=cv.INTER_LANCZOS4,
),
)
gray = cast(
"NDArrayUint8",
cv.bilateralFilter(gray, _OCR_BILATERAL_DIAMETER, _OCR_BILATERAL_SIGMA_COLOR, _OCR_BILATERAL_SIGMA_SPACE),
)
gray = cast("NDArrayUint8", cv.normalize(gray, gray, 0, MAX_COLOR_VALUE, cv.NORM_MINMAX))
return cast("NDArrayUint8", cv.cvtColor(gray, cv.COLOR_GRAY2BGR))
@check_valid_image
def get_text(
self: Self,
rect: Rect | None = None,
colors: Color | Sequence[Color] | None = None,
tolerance: int = 0,
confidence: float | None = 0.8,
) -> list[OcrTextEntry]:
"""Extract text from the backbuffer using PaddleOCR.
Args:
rect: Search region (x, y, width, height).
colors: RGB colour(s) to isolate before OCR.
tolerance: Per-channel tolerance when matching the colour filter.
confidence: Minimum acceptable OCR confidence between 0 and 1; ``None`` disables confidence filtering.
Returns:
Text entries with bounding boxes and confidence levels, using backbuffer coordinates.
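Example (a sketch; the region, colour, and thresholds are illustrative)::
entries = vision.get_text(rect=(10, 10, 300, 60), colors=(255, 255, 0), tolerance=30, confidence=0.7)
texts = [entry["text"] for entry in entries]  # each entry also carries ``rect`` and ``confidence``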
"""
image = self._crop_image(rect, self.opencv_image)
if colors:
image = filter_colors(image, colors, tolerance, keep_original_colors=True)
work = self._prepare_ocr_input(image)
payload = self._extract_ocr_payload(cast("object", self._ensure_ocr().predict(work)))
if payload is None:
return []
rec_texts = cast("list[str]", payload.get("rec_texts") or [])
rec_scores = cast("list[float]", payload.get("rec_scores") or [])
rec_boxes = cast("Sequence[Sequence[float | int]] | None", payload.get("rec_boxes"))
rec_polys = cast("Sequence[object] | None", payload.get("rec_polys"))
min_confidence = float(confidence) if confidence is not None else None
results: list[OcrTextEntry] = []
n = min(len(rec_texts), len(rec_scores))
for i in range(n):
text = str(rec_texts[i]).strip()
if not text:
continue
score = float(rec_scores[i])
if min_confidence is not None and score < min_confidence:
continue
bbox: tuple[int, int, int, int] | None = None
if rec_boxes is not None and i < len(rec_boxes):
bbox = self._box_to_bbox(rec_boxes[i])
elif rec_polys is not None and i < len(rec_polys):
bbox = self._poly_to_bbox(rec_polys[i])
if bbox is None:
continue
x_min_i, y_min_i, x_max_i, y_max_i = bbox
left = int(x_min_i / _OCR_UPSCALE_FACTOR)
top = int(y_min_i / _OCR_UPSCALE_FACTOR)
right = int(x_max_i / _OCR_UPSCALE_FACTOR)
bottom = int(y_max_i / _OCR_UPSCALE_FACTOR)
width = max(0, right - left)
height = max(0, bottom - top)
if rect is not None:
left += rect[0]
top += rect[1]
results.append(OcrTextEntry(text=text, rect=[left, top, width, height], confidence=score))
return results
@check_valid_image
def get_color(self: Self, point: Point) -> Color:
"""Return the color of the pixel at the specified coordinates.
Args:
point: Pixel coordinates expressed as ``(x, y)``.
Returns:
Pixel colour as ``(R, G, B)``.
Raises:
InvalidImageError: Raised when ``self.opencv_image`` is empty.
ValueError: Raised when the backbuffer is not a 3-channel BGR image.
IndexError: If the coordinates are out of bounds.
"""
if self.opencv_image.ndim != COLOR_IMAGE_NDIMS or self.opencv_image.shape[-1] != RGB_CHANNELS:
msg = "Backbuffer must be a 3-channel BGR image."
raise ValueError(msg)
x, y = point
if not (0 <= x < self.opencv_image.shape[1] and 0 <= y < self.opencv_image.shape[0]):
raise IndexError(point)
b, g, r = self.opencv_image[y, x].tolist()
return int(r), int(g), int(b)
@check_valid_image
def find_color(
self: Self,
color: Color,
rect: Rect | None = None,
tolerance: int = 0,
) -> list[Point]:
"""Find pixel coordinates matching a color within a tolerance.
Args:
color: Target colour expressed as ``(R, G, B)``.
rect: Optional search region described as ``(x, y, width, height)``.
tolerance: Allowed per-channel delta when searching for colour matches.
Returns:
Pixel coordinates in image space that match the colour constraint.
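Example (a sketch; looks for near-red pixels in an illustrative region)::
points = vision.find_color((255, 0, 0), rect=(0, 0, 200, 200), tolerance=20)
first_hit = points[0] if points else None  # (x, y) in full-image coordinates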
"""
image = self._crop_image(rect)
mask = filter_colors(image, color, tolerance)
points = np.column_stack(np.where(mask == MAX_COLOR_VALUE)[::-1])
if points.size == 0:
return []
if rect is not None:
points = points + np.array(rect[0:2], dtype=points.dtype)
return [(int(point[0]), int(point[1])) for point in points]
@check_valid_image
def get_average_color(self: Self, rect: Rect | None = None) -> Color:
"""Calculate the average color within a specified region.
Args:
rect: Region to average; ``None`` uses the full image.
Returns:
Average RGB value inside the requested region.
"""
r, g, b = (int(x) for x in self._get_average_color(self.opencv_image, rect))
return r, g, b
def _get_average_color(
self: Self,
image: NDArrayUint8,
rect: Rect | None = None,
) -> NDArrayInt16:
"""Calculate the average color of the specified image region.
Args:
image: Image to sample in BGR order.
rect: Region to sample; defaults to the entire frame.
Returns:
Average colour in RGB channel order.
"""
image = self._crop_image(rect, image)
avg_color = cv.mean(image)
avg_color_bgr: NDArrayInt16
if isinstance(avg_color, tuple):
avg_color_bgr = np.array(avg_color[:3], dtype=np.int16)
else:
avg_color_bgr = np.array([avg_color, avg_color, avg_color], dtype=np.int16)
return avg_color_bgr[::-1]
@staticmethod
def _pack_bgr_frame(frame: NDArrayUint8) -> npt.NDArray[np.uint32]:
"""Pack a BGR frame into 24-bit integers for fast uniqueness queries."""
pixels_bgr = frame.reshape(-1, RGB_CHANNELS)
packed = (
pixels_bgr[:, 0].astype(np.uint32)
| (pixels_bgr[:, 1].astype(np.uint32) << 8)
| (pixels_bgr[:, 2].astype(np.uint32) << 16)
)
return cast("npt.NDArray[np.uint32]", packed)
@staticmethod
def _unpack_packed_bgr(value: int) -> Color:
"""Unpack a 24-bit packed BGR integer into an RGB tuple."""
b = value & 0xFF
g = (value >> 8) & 0xFF
r = (value >> 16) & 0xFF
return int(r), int(g), int(b)
@check_valid_image
def get_most_common_color(
self: Self,
rect: Rect | None = None,
index: int = 1,
ignore_colors: Color | Sequence[Color] | None = None,
) -> Color:
"""Determines the most common color in the specified region.
Args:
rect: Region to sample; ``None`` uses the full image.
index: Rank of the dominant colour to extract (1-based).
ignore_colors: RGB colour(s) to skip while ranking.
Returns:
Most common RGB colour in the region.
Raises:
ValueError: If ``index`` is less than 1 or the requested region contains no pixels after filtering.
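Example (a sketch; ignores pure black while ranking)::
dominant = vision.get_most_common_color(rect=(0, 0, 50, 50), ignore_colors=[(0, 0, 0)])
runner_up = vision.get_most_common_color(rect=(0, 0, 50, 50), index=2)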
"""
if index < 1:
msg = "index must be >= 1."
raise ValueError(msg)
cropped_image = self._crop_image(rect)
packed = self._pack_bgr_frame(cropped_image)
if ignore_colors is not None:
ignore_arr = np.asarray(ignore_colors, dtype=np.uint32)
if ignore_arr.size:
if ignore_arr.ndim == 1:
ignore_arr = ignore_arr[np.newaxis, :]
if ignore_arr.shape[1] != RGB_CHANNELS:
msg = "ignore_colors must contain RGB triples."
raise ValueError(msg)
ignore_packed = ignore_arr[:, 2] | (ignore_arr[:, 1] << 8) | (ignore_arr[:, 0] << 16)
packed = packed[~np.isin(packed, ignore_packed)]
if packed.size == 0:
msg = "No pixels available in the requested region."
raise ValueError(msg)
unique, counts = np.unique(packed, return_counts=True)
sorted_indices = np.argsort(counts)[::-1]
desired_index = min(index - 1, len(sorted_indices) - 1)
chosen = int(unique[sorted_indices[desired_index]])
return self._unpack_packed_bgr(chosen)
@check_valid_image
def get_count_of_color(
self: Self,
color: Color,
rect: Rect | None = None,
tolerance: int | None = 0,
) -> int:
"""Counts the number of pixels matching a given color within a tolerance.
Args:
color: Target colour expressed as ``(R, G, B)``.
rect: Region to sample; ``None`` uses the full image.
tolerance: Allowed per-channel difference; ``None`` is treated as 0 for exact matches.
Returns:
Number of pixels matching the specified colour.
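Example (a sketch; counts near-white pixels across the whole backbuffer)::
count = vision.get_count_of_color((255, 255, 255), tolerance=10)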
"""
cropped_image = self._crop_image(rect)
match_mask = filter_colors(cropped_image, color, tolerance or 0)
return int(np.count_nonzero(match_mask))
@check_valid_image
def get_all_colors_with_counts(
self: Self,
rect: Rect | None = None,
) -> list[tuple[Color, int]]:
"""Retrieves all colors in the specified region along with their pixel counts.
Args:
rect: Region to sample; ``None`` uses the full image.
Returns:
Colour counts ordered by frequency.
"""
cropped_image = self._crop_image(rect)
packed = self._pack_bgr_frame(cropped_image)
unique, counts = np.unique(packed, return_counts=True)
sorted_indices = np.argsort(counts)[::-1]
results: list[tuple[Color, int]] = []
for packed_color, count in zip(unique[sorted_indices], counts[sorted_indices], strict=False):
results.append((self._unpack_packed_bgr(int(packed_color)), int(count)))
return results
@staticmethod
def _get_dominant_color(image: NDArrayUint8) -> NDArrayUint8:
"""Return the median colour (a proxy for a dominant colour) in the given image.
Args:
image: Input frame in BGR order.
Returns:
Median colour in BGR order.
"""
reshaped_image = image.reshape(-1, RGB_CHANNELS)
return cast("NDArrayUint8", np.median(reshaped_image, axis=0).astype(np.uint8))
@check_valid_image
def maximize_color_match(
self: Self,
rect: Rect,
initial_tolerance: int = 100,
tolerance_step: int = 1,
) -> tuple[Color, int]:
"""Finds the color and tolerance that best match the region's dominant color.
Args:
rect: Region to evaluate when computing the dominant colour.
initial_tolerance: Initial tolerance applied when searching.
tolerance_step: Amount to decrease tolerance when narrowing the search.
Returns:
Matched RGB colour and the tolerance applied.
Raises:
ValueError: If ``initial_tolerance`` is negative or ``tolerance_step`` is less than 1.
"""
if initial_tolerance < 0:
msg = "initial_tolerance must be non-negative."
raise ValueError(msg)
if tolerance_step <= 0:
msg = "tolerance_step must be >= 1."
raise ValueError(msg)
cropped_image = self._crop_image(rect)
b, g, r = self._get_dominant_color(cropped_image).tolist()
dominant_color = (int(b), int(g), int(r))
best_color, best_tolerance = self._find_best_color_match(
cropped_image, dominant_color, initial_tolerance, tolerance_step
)
return best_color, best_tolerance
def _find_best_color_match(
self: Self,
cropped_image: NDArrayUint8,
dominant_color: tuple[int, int, int],
initial_tolerance: int,
tolerance_step: int,
) -> tuple[Color, int]:
"""Searches for the best color match within the specified tolerance range.
Args:
cropped_image: Image region under evaluation (BGR).
dominant_color: Dominant colour used for comparison (BGR).
initial_tolerance: Starting tolerance before decrements.
tolerance_step: Amount to reduce the tolerance between attempts.
Returns:
Matched RGB colour with the tolerance applied.
"""
if cropped_image.size == 0:
msg = "cropped_image must not be empty."
raise ValueError(msg)
if initial_tolerance < 0:
msg = "initial_tolerance must be non-negative."
raise ValueError(msg)
if tolerance_step <= 0:
msg = "tolerance_step must be >= 1."
raise ValueError(msg)
tolerance = initial_tolerance
best_tolerance = 0
best_ratio = -1.0
inner_total_pixels = cropped_image.size // RGB_CHANNELS
outer_total_pixels = (self.opencv_image.size // RGB_CHANNELS) - inner_total_pixels
while tolerance >= 0:
lower_bound, upper_bound = self._get_color_bounds(dominant_color, tolerance)
pixel_count, outside_pixel_count = self._get_pixel_counts(cropped_image, lower_bound, upper_bound)
inner_ratio = pixel_count / inner_total_pixels
outer_ratio = (outside_pixel_count / outer_total_pixels) if outer_total_pixels else 0.0
ratio = inner_ratio / (outer_ratio + 1.0)
if ratio > best_ratio:
best_ratio = ratio
best_tolerance = tolerance
tolerance -= tolerance_step
best_color_rgb = (dominant_color[2], dominant_color[1], dominant_color[0]) # Convert BGR to RGB.
return best_color_rgb, best_tolerance
@staticmethod
def _get_color_bounds(dominant_color: tuple[int, int, int], tolerance: int) -> tuple[NDArrayUint8, NDArrayUint8]:
"""Calculates lower and upper bounds for a color given a tolerance.
Args:
dominant_color: Target colour used to refine matches (BGR).
tolerance: Channel tolerance currently applied to the search.
Returns:
Lower and upper BGR bounds.
"""
dominant = np.asarray(dominant_color, dtype=np.int16)
lower_bound: NDArrayUint8 = np.clip(dominant - tolerance, 0, MAX_COLOR_VALUE).astype(np.uint8)
upper_bound: NDArrayUint8 = np.clip(dominant + tolerance, 0, MAX_COLOR_VALUE).astype(np.uint8)
return lower_bound, upper_bound
def _get_pixel_counts(
self: Self,
cropped_image: NDArrayUint8,
lower_bound: NDArrayUint8,
upper_bound: NDArrayUint8,
) -> tuple[int, int]:
"""Counts pixels within the specified color bounds in the cropped and main images.
Args:
cropped_image: Image region extracted for analysis (BGR).
lower_bound: Lower inclusive colour bound (BGR).
upper_bound: Upper inclusive colour bound (BGR).
Returns:
tuple[int, int]: Pixel counts inside the region and outside it.
"""
mask = cv.inRange(cropped_image, lower_bound, upper_bound)
pixel_count = int(np.count_nonzero(mask))
outside_mask = cv.inRange(self.opencv_image, lower_bound, upper_bound)
outside_pixel_count = int(np.count_nonzero(outside_mask)) - pixel_count
return pixel_count, outside_pixel_count
@staticmethod
def _calculate_median_difference(
image1: NDArrayUint8,
image2: NDArrayUint8,
mask: NDArrayUint8 | None = None,
) -> int:
"""Calculate the median absolute difference between two images.
Args:
image1: The first image.
image2: The second image.
mask: Optional inclusion mask; pixels where ``mask`` is zero are ignored.
Returns:
int: Median colour difference, or ``-1`` when shapes mismatch or no pixels remain after masking.
"""
if image1.shape != image2.shape:
return -1
image1_f = image1.astype(np.float32)
image2_f = image2.astype(np.float32)
if mask is not None:
mask_expanded = mask.astype(bool)[..., None]
image1_f = np.where(mask_expanded, image1_f, np.nan)
image2_f = np.where(mask_expanded, image2_f, np.nan)
diff = np.abs(image1_f - image2_f)
with warnings.catch_warnings():
warnings.filterwarnings("ignore", "All-NaN slice encountered", RuntimeWarning)
median_diff = np.nanmedian(diff)
if np.isnan(median_diff):
return -1
return int(median_diff)
@check_valid_image
def erode_image(self: Self, iterations: int = 1, kernel: NDArrayUint8 | None = None) -> None:
"""Applies morphological erosion to the backbuffer image.
Args:
iterations: Number of erosion passes to run. Defaults to 1.
kernel: Structuring element to use; defaults to a 3x3 ones matrix.
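Example (a sketch; an elliptical kernel instead of the default 3x3 square)::
import cv2 as cv
vision.erode_image(iterations=2, kernel=cv.getStructuringElement(cv.MORPH_ELLIPSE, (5, 5)))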
"""
if kernel is None:
kernel = np.ones((_DEFAULT_MORPH_KERNEL_SIZE, _DEFAULT_MORPH_KERNEL_SIZE), np.uint8)
self.opencv_image = cast(
"NDArrayUint8",
cv.erode(self.opencv_image, kernel, iterations=iterations),
)
@check_valid_image
def dilate_image(self: Self, iterations: int = 1, kernel: NDArrayUint8 | None = None) -> None:
"""Applies morphological dilation to the backbuffer image.
Args:
iterations: Number of dilation passes to run. Defaults to 1.
kernel: Structuring element to use; defaults to a 3x3 ones matrix.
"""
if kernel is None:
kernel = np.ones((_DEFAULT_MORPH_KERNEL_SIZE, _DEFAULT_MORPH_KERNEL_SIZE), np.uint8)
self.opencv_image = cast(
"NDArrayUint8",
cv.dilate(self.opencv_image, kernel, iterations=iterations),
)
@check_valid_image
def find_image(
self: Self,
sub_image: NDArrayUint8 | Image.Image,
rect: Rect | None = None,
confidence: float = 0.95,
median_tolerance: int | None = None,
) -> list[Rect]:
"""Finds occurrences of a subimage within the main image using template matching.
Args:
sub_image: Template image in RGB/RGBA ordering.
rect: Search region specified as ``(x, y, width, height)``. If ``None``, the entire image is used.
confidence: Matching confidence threshold (default ``0.95``).
median_tolerance: Optional per-channel median colour tolerance for matches.
Returns:
Bounding boxes locating the subimage.
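Example (a sketch; the template path and threshold are illustrative)::
from PIL import Image
matches = vision.find_image(Image.open("button.png"), confidence=0.9)  # list of (x, y, w, h)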
"""
image = self._crop_image(rect)
sub_image_bgr, mask = self._prepare_sub_image(sub_image)
main_image_gray, sub_image_gray = self._convert_to_grayscale(image, sub_image_bgr)
res = self._perform_template_matching(main_image_gray, sub_image_gray, mask, confidence)
rects = self._process_matching_results(res, image, sub_image_bgr, mask, rect, median_tolerance)
return self._group_and_convert_to_shape_list(rects)
@staticmethod
def _prepare_sub_image(
sub_image: NDArrayUint8 | Image.Image,
) -> tuple[NDArrayUint8, NDArrayUint8 | None]:
"""Prepares the subimage for template matching.
Converts the subimage to BGR format and extracts the alpha channel as a mask if present.
Args:
sub_image: Template image in RGB/RGBA ordering.
Returns:
Prepared template and optional mask.
Raises:
ValueError: If ``sub_image`` does not have shape ``(H, W, 3)`` or ``(H, W, 4)``.
"""
if isinstance(sub_image, Image.Image):
sub_image_arr: NDArrayUint8 = np.array(sub_image.convert("RGBA"), dtype=np.uint8)
else:
sub_image_arr = sub_image
if sub_image_arr.ndim != COLOR_IMAGE_NDIMS or sub_image_arr.shape[-1] not in (RGB_CHANNELS, RGBA_CHANNELS):
msg = "sub_image must have shape (H, W, 3) or (H, W, 4)."
raise ValueError(msg)
if sub_image_arr.shape[-1] == RGBA_CHANNELS:
sub_alpha = sub_image_arr[..., 3]
sub_image_bgr = cv.cvtColor(sub_image_arr[..., :3], cv.COLOR_RGB2BGR)
mask = sub_alpha
else:
sub_image_bgr = cv.cvtColor(sub_image_arr, cv.COLOR_RGB2BGR)
mask = None
return cast("NDArrayUint8", sub_image_bgr), mask
@staticmethod
def _convert_to_grayscale(
main_image: NDArrayUint8, sub_image_bgr: NDArrayUint8
) -> tuple[NDArrayUint8, NDArrayUint8]:
"""Converts the main and subimages to grayscale.
Args:
main_image: Reference image to search within.
sub_image_bgr: Template in BGR colour space.
Returns:
Grayscale main image and template.
"""
main_image_gray = cv.cvtColor(main_image, cv.COLOR_BGR2GRAY)
sub_image_gray = cv.cvtColor(sub_image_bgr, cv.COLOR_BGR2GRAY)
return cast("NDArrayUint8", main_image_gray), cast("NDArrayUint8", sub_image_gray)
@staticmethod
def _perform_template_matching(
main_image_gray: NDArrayUint8,
sub_image_gray: NDArrayUint8,
mask: NDArrayUint8 | None,
confidence: float,
) -> MaskArray:
"""Performs template matching between the main image and subimage.
Args:
main_image_gray: Main frame in grayscale.
sub_image_gray: Template converted to grayscale for matching.
mask: Optional template mask.
confidence: Minimum required match confidence.
Returns:
MaskArray: Mask where template scores exceed the confidence threshold.
"""
res = cv.matchTemplate(main_image_gray, sub_image_gray, cv.TM_CCORR_NORMED, mask=mask)
mask_arr: MaskArray = np.logical_and(res >= confidence, np.logical_not(np.isinf(res))).astype(np.uint8)
return mask_arr
def _process_matching_results(
self: Self,
res: MaskArray,
main_image: NDArrayUint8,
sub_image_bgr: NDArrayUint8,
mask: NDArrayUint8 | None,
rect: Rect | None,
median_tolerance: int | None,
) -> list[Rect]:
"""Processes template matching results to extract matching rectangles.
Args:
res: Binary mask produced by the template comparison.
main_image: Reference image to search within (BGR).
sub_image_bgr: Template in BGR colour space.
mask: Optional template mask.
rect: Region to constrain the template search.
median_tolerance: Optional tolerance applied to colour medians.
Returns:
Bounding boxes for detected matches.
"""
rects: list[Rect] = []
template_height, template_width = sub_image_bgr.shape[:2]
offset_x = rect[0] if rect is not None else 0
offset_y = rect[1] if rect is not None else 0
for y, x in np.column_stack(np.where(res)):
y_i = int(y)
x_i = int(x)
main_image_region = main_image[y_i : y_i + template_height, x_i : x_i + template_width]
found_rect = (x_i + offset_x, y_i + offset_y, template_width, template_height)
if median_tolerance is not None:
found_median_diff = self._calculate_median_difference(main_image_region, sub_image_bgr, mask)
if found_median_diff < median_tolerance:
rects.append(found_rect)
else:
rects.append(found_rect)
return rects
@staticmethod
def _group_and_convert_to_shape_list(
rects: list[Rect],
) -> list[Rect]:
"""Group similar rectangles and return a consolidated list.
Args:
rects: Bounding boxes emitted by the matcher.
Returns:
Grouped rectangles merged by OpenCV's clustering.
"""
if not rects:
return []
rects_list: list[list[int]] = []
for rect in rects:
x, y, w, h = rect
expanded = [int(x), int(y), int(w), int(h)]
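# Append each rectangle twice so cv.groupRectangles (groupThreshold=1) does not discard isolated matches.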
rects_list.append(expanded)
rects_list.append(expanded.copy())
grouped_rects, _ = cv.groupRectangles(
rects_list,
groupThreshold=_GROUP_RECTANGLES_THRESHOLD,
eps=_GROUP_RECTANGLES_EPS,
)
return [(int(r[0]), int(r[1]), int(r[2]), int(r[3])) for r in grouped_rects]
@check_valid_image
def find_contours(
self: Self,
color: Color,
rect: Rect | None = None,
tolerance: int = 0,
min_area: int = 10,
vertices: int | None = None,
*,
close_and_dilate: bool = False,
) -> list[Contour]:
"""Find contours in the backbuffer that match a color.
Args:
color: Target colour expressed as ``(R, G, B)``.
rect: Search region specified as ``(x, y, width, height)``. If ``None``, the entire image is used.
tolerance: Allowed deviation per colour channel.
min_area: Minimum contour area, in square pixels, for a contour to qualify.
vertices: Required vertex count for returned contours; ``None`` disables the vertex filter.
close_and_dilate: When ``True``, apply a 3x3 morphological close and dilation pass to the filtered mask
before extracting contours.
Returns:
Contours matching the search criteria.
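Example (a sketch; colour, region, and filter values are illustrative)::
blobs = vision.find_contours((0, 255, 0), rect=(0, 0, 400, 300), tolerance=25, min_area=50)
quads = vision.find_contours((0, 255, 0), tolerance=25, vertices=4, close_and_dilate=True)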
"""
image = self._crop_image(rect)
mask: MaskArray = filter_colors(image, color, tolerance)
if close_and_dilate:
kernel = np.ones((_DEFAULT_MORPH_KERNEL_SIZE, _DEFAULT_MORPH_KERNEL_SIZE), np.uint8)
mask = cast("MaskArray", cv.morphologyEx(mask, cv.MORPH_CLOSE, kernel, iterations=1))
mask = cast("MaskArray", cv.dilate(mask, kernel, iterations=1))
contours, _ = cv.findContours(mask, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)
if rect is not None:
offset = np.array([rect[0], rect[1]], dtype=np.int32)
contours = [c + offset for c in contours]
contours = [c for c in contours if cv.contourArea(c) >= min_area]
if vertices is not None:
contours = [
c
for c in contours
if vertices == len(cv.approxPolyDP(c, 0.01 * cv.arcLength(c, closed=True), closed=True))
]
return cast("list[Contour]", contours)
@check_valid_image
def draw_points(
self: Self,
points: Sequence[Point],
color: Color = (MAX_COLOR_VALUE, 0, 0),
) -> None:
"""Draws points on the backbuffer image.
Args:
points: Coordinates to mark on the backbuffer.
color: Drawing colour (RGB). Defaults to red.
"""
if not points:
return
points_arr = np.asarray(points, dtype=np.int64)
self.opencv_image[points_arr[:, 1], points_arr[:, 0]] = color[::-1]
@check_valid_image
def draw_contours(
self: Self,
contours: Contour | Sequence[Contour],
color: Color = (MAX_COLOR_VALUE, 0, 0),
) -> None:
"""Draws contours on the backbuffer image.
Args:
contours: Contour(s) as produced by OpenCV.
color: Drawing colour (RGB). Defaults to red.
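Example (a sketch; the output path is illustrative)::
contours = vision.find_contours((0, 255, 0), tolerance=25)
vision.draw_contours(contours)  # defaults to red
vision.save_backbuffer_to_file("contours_debug.png")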
"""
contours_to_draw = [contours] if isinstance(contours, np.ndarray) else list(contours)
if not contours_to_draw:
return
cv.drawContours(self.opencv_image, cast("list[Contour]", contours_to_draw), -1, color[::-1], 2)
@check_valid_image
def draw_circle(
self: Self,
circle: tuple[int, int, int],
color: Color = (MAX_COLOR_VALUE, 0, 0),
) -> None:
"""Draws a circle on the backbuffer image.
Args:
circle: Circle definition ``(x, y, radius)``.
color: Drawing colour (RGB). Defaults to red.
"""
x, y, r = circle
cv.circle(self.opencv_image, (x, y), r, color[::-1], 2, cv.LINE_4)
@check_valid_image
def draw_rectangle(
self: Self,
rect: Rect,
color: Color = (MAX_COLOR_VALUE, 0, 0),
) -> None:
"""Draws a rectangle on the backbuffer image.
Args:
rect: Rectangle specified as ``(x, y, width, height)``.
color: Drawing colour (RGB). Defaults to red.
"""
x, y, w, h = rect
cv.rectangle(self.opencv_image, (x, y), (x + w, y + h), color[::-1], 2, cv.LINE_4)
@check_valid_image
def filter_colors(
self: Self,
colors: Color | Sequence[Color],
tolerance: int = 0,
*,
keep_original_colors: bool = False,
) -> None:
"""Filters the backbuffer image to retain only specified colors within a given tolerance.
Args:
colors: Colours to keep while filtering.
tolerance: Per-channel tolerance threshold (0-255).
keep_original_colors: When ``True``, retain source colours for matching pixels; otherwise replace the
backbuffer with a binary mask.
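Example (a sketch; the colours are illustrative)::
vision.filter_colors([(255, 255, 255), (255, 0, 0)], tolerance=15, keep_original_colors=True)
# Omitting ``keep_original_colors`` reduces the backbuffer to a binary mask instead.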
"""
filtered_image = filter_colors(self.opencv_image, colors, tolerance, keep_original_colors=keep_original_colors)
self.opencv_image = filtered_image