"""This module provides the Vision class and related utilities for image capture, processing, and OCR.
It integrates with OpenCV, pytesseract, and other libraries to perform image analysis,
text extraction, and various image manipulation operations.
"""
from __future__ import annotations
__all__ = ("Vision",)
import io
import logging
import pathlib
from typing import TYPE_CHECKING, cast
import cv2 as cv
import numpy as np
import numpy.typing as npt
import polars as pl
import win32con
import win32gui
import win32ui
from PIL import Image
from tesserocr import OEM, PSM, PyTessBaseAPI
from typing_extensions import Self
from .decorators import check_valid_hwnd, check_valid_image
from .image_processing import filter_colors
from .window_capture import WindowCapture
if TYPE_CHECKING:
from collections.abc import Sequence
RGBA_CHANNELS = 4
BGR_CHANNELS = 3
MAX_COLOR_VALUE = 255
ONE_HUNDRED = 100
logger = logging.getLogger(__name__)
class Vision(WindowCapture):
"""A class for image processing and optical character recognition (OCR).
Extends the WindowCapture class to provide methods for capturing a window,
processing the image, and extracting text and color information.
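    Example:
        A minimal end-to-end sketch (illustrative only; assumes ``hwnd`` is a valid
        handle to a visible window, e.g. obtained via ``win32gui.FindWindow``)::

            vision = Vision(hwnd)
            vision.refresh()  # capture the window into the backbuffer
            for entry in vision.get_text(confidence=0.8):
                print(entry["text"], entry["rect"], entry["confidence"])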
"""
def __init__(self: Self, hwnd: int = -1) -> None:
"""Initializes a Vision object.
Args:
hwnd (int): Window handle of the target window. Defaults to -1.
"""
super().__init__(hwnd)
# Holds the image in OpenCV format.
self.opencv_image: npt.NDArray[np.uint8] = np.empty(0, dtype=np.uint8)
# Define the path to the tessdata directory and set up Tesseract configuration.
absolute_directory = pathlib.Path(__file__).parents[1] / "data" / "traineddata"
# noinspection PyArgumentList
self.api = PyTessBaseAPI(
path=str(absolute_directory),
lang="runescape",
psm=PSM.SPARSE_TEXT,
oem=OEM.LSTM_ONLY,
)
self._config = rf"--tessdata-dir {absolute_directory} --oem 1 --psm 11"
def set_backbuffer(self: Self, image: npt.NDArray[np.uint8] | Image.Image) -> None:
"""Sets the image buffer to the provided NumPy array or PIL Image.
Args:
image (npt.NDArray[np.uint8] | Image.Image): Image data used to refresh the backbuffer.
"""
if isinstance(image, Image.Image):
self.opencv_image = cast(
"npt.NDArray[np.uint8]",
cv.cvtColor(np.array(image), cv.COLOR_RGB2BGR),
)
else:
self.opencv_image = image
@check_valid_hwnd
def refresh(self: Self, *, set_backbuffer: bool = True) -> npt.NDArray[np.uint8] | None:
"""Captures the current window image and converts it to an OpenCV-compatible format.
Args:
            set_backbuffer (bool): When ``True``, persist the capture to ``self.opencv_image``;
                when ``False``, return the captured frame instead.
Raises:
InvalidHandleError: Raised when ``self.hwnd`` is not a valid window handle.
Returns:
npt.NDArray[np.uint8] | None: Captured frame when ``set_backbuffer`` is ``False``; ``None`` otherwise.
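        Example:
            Illustrative sketch; assumes the instance already holds a valid ``hwnd``::

                vision.refresh()                              # updates self.opencv_image
                frame = vision.refresh(set_backbuffer=False)  # leaves the backbuffer untouched
                height, width, channels = frame.shape         # BGR frame, channels == 3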
"""
# Get window dimensions.
left, top, right, bottom = win32gui.GetWindowRect(self.hwnd)
width = right - left
height = bottom - top
        # Wrap the window's device context and create a memory DC to copy into.
        window_dc = win32gui.GetWindowDC(self.hwnd)
        mfc_dc = win32ui.CreateDCFromHandle(window_dc)
        mem_dc = mfc_dc.CreateCompatibleDC()
        bitmap = win32ui.CreateBitmap()
        bitmap.CreateCompatibleBitmap(mfc_dc, width, height)
        mem_dc.SelectObject(bitmap)
        # Copy the window contents onto the bitmap.
        mem_dc.BitBlt((0, 0), (width, height), mfc_dc, (0, 0), win32con.SRCCOPY)
        # Convert the raw bitmap bytes into a format that OpenCV can read.
        raw_bytes = bitmap.GetBitmapBits(True)
        img = np.frombuffer(raw_bytes, dtype="uint8")
        img.shape = (height, width, 4)
# Free resources.
        mfc_dc.DeleteDC()
        mem_dc.DeleteDC()
win32gui.ReleaseDC(self.hwnd, window_dc)
win32gui.DeleteObject(bitmap.GetHandle())
# Ensure the image is contiguous and drop the alpha channel.
image = np.ascontiguousarray(img[..., :3])
if set_backbuffer:
self.set_backbuffer(image)
return None
return image
@check_valid_image
def save_backbuffer_to_file(self: Self, file_name: str) -> None:
"""Saves the backbuffer image to a file.
Args:
file_name (str): Path where the backbuffer snapshot is stored.
"""
cv.imwrite(file_name, self.opencv_image)
@check_valid_hwnd
def get_pixel_change(self: Self, area: tuple[int, int, int, int] | None = None) -> int:
"""Calculates the number of pixels that have changed between the current image and a newly captured image.
Args:
            area (tuple[int, int, int, int] | None): Region of interest expressed as
                (x, y, width, height); ``None`` inspects the full frame.
Raises:
InvalidImageError: Raised when the capture buffer is empty.
Returns:
int: Count of pixels with different intensities between frames.
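        Example:
            A rough motion probe (illustrative; the region and threshold are placeholders)::

                vision.refresh()
                changed = vision.get_pixel_change(area=(10, 10, 200, 150))
                if changed > 500:
                    print("something moved inside the region")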
"""
# Crop and convert the current image to grayscale.
image = self._crop_image(area, self.opencv_image)
image = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
# Capture a new image and convert it to grayscale.
updated_image = self.refresh(set_backbuffer=False)
updated_image = self._crop_image(area, updated_image)
updated_image = cv.cvtColor(updated_image, cv.COLOR_BGR2GRAY)
# Compute and return the difference.
diff = cv.absdiff(image, updated_image)
return int(np.count_nonzero(diff))
def _get_grouped_text(
self: Self,
image: npt.NDArray[np.uint8],
rect: tuple[int, int, int, int] | None = None,
colors: tuple[int, int, int] | list[tuple[int, int, int]] | None = None,
tolerance: int = 0,
) -> pl.LazyFrame:
"""Preprocesses the image and extracts text using Tesseract OCR, grouping text data by block.
Args:
image (npt.NDArray[np.uint8]): Image to preprocess before OCR.
rect (tuple[int, int, int, int] | None): Optional region of interest described as
(x, y, width, height).
colors (tuple[int, int, int] | list[tuple[int, int, int]] | None): Colours (RGB) to isolate before OCR.
tolerance (int): Per-channel tolerance when matching the colour filter.
Returns:
pl.LazyFrame: Grouped OCR output including text and bounding boxes.
"""
if colors:
image = filter_colors(image, colors, tolerance)
# Resize image to double the size.
resized_img = cv.resize(image, None, fx=2, fy=2, interpolation=cv.INTER_CUBIC)
img = cv.bilateralFilter(resized_img, 9, 75, 75)
# Convert to grayscale and apply thresholding.
        if img.ndim == 3 and img.shape[-1] == BGR_CHANNELS:
img = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
img = cv.threshold(img, 0, MAX_COLOR_VALUE, cv.THRESH_BINARY + cv.THRESH_OTSU)[1]
img = cv.bitwise_not(img)
# Perform OCR.
self.api.SetImage(Image.fromarray(img))
if rect:
self.api.SetRectangle(*rect)
text_data = self.api.GetTSVText(0)
self.api.Clear()
text = pl.scan_csv(
io.StringIO(text_data),
has_header=False,
separator="\t",
quote_char=None,
new_columns=[
"level",
"page_num",
"block_num",
"par_num",
"line_num",
"word_num",
"left",
"top",
"width",
"height",
"conf",
"text",
],
)
        # Keep only word-level rows with a usable confidence score.
text = text.filter((pl.col("conf") > 0) & (pl.col("conf") < ONE_HUNDRED))
text = text.with_columns(
[
pl.col("text").cast(pl.Utf8),
(pl.col("conf") / 100).alias("conf"),
]
)
text = text.filter(pl.col("text").str.strip_chars().str.len_chars() > 0)
grouped_text = text.group_by("block_num").agg(
[
pl.col("word_num").max().alias("word_num"),
pl.col("left").min().alias("left"),
pl.col("top").min().alias("top"),
pl.col("height").max().alias("height"),
pl.col("conf").max().alias("confidence"),
pl.col("text").str.concat(" ").alias("text"),
((pl.col("left") + pl.col("width")).max() - pl.col("left").min()).alias("width"),
]
)
sorted_text = grouped_text.sort("confidence", descending=True)
return sorted_text.with_columns(
[
(pl.col("top") // 2).alias("top"),
(pl.col("left") // 2).alias("left"),
(pl.col("height") // 2).alias("height"),
(pl.col("width") // 2).alias("width"),
]
)
def _crop_image(
self: Self,
rect: tuple[int, int, int, int] | None = None,
image: npt.NDArray[np.uint8] | None = None,
) -> npt.NDArray[np.uint8]:
"""Crops the current image or a provided image to the specified rectangle.
Args:
rect (tuple[int, int, int, int] | None): Region to crop in
(x, y, width, height) form.
image (npt.NDArray[np.uint8] | None): Explicit image to operate on; defaults to ``self.opencv_image``.
Returns:
npt.NDArray[np.uint8]: Cropped slice of the source image.
"""
image = image if image is not None else self.opencv_image
if rect:
x, y, w, h = rect
return image[y : y + h, x : x + w]
return image
@check_valid_image
def get_text(
self: Self,
rect: tuple[int, int, int, int] | None = None,
colors: tuple[int, int, int] | list[tuple[int, int, int]] | None = None,
tolerance: int = 0,
confidence: float | None = 0.8,
) -> list[dict[str, str | int | float | list[int]]]:
"""Extracts text from the backbuffer using Tesseract OCR.
Only text with confidence greater than or equal to the provided threshold is returned.
Args:
rect (tuple[int, int, int, int] | None): Search region specified as
(x, y, width, height).
colors (tuple[int, int, int] | list[tuple[int, int, int]] | None): Colours (RGB) to isolate before OCR.
tolerance (int): Per-channel tolerance when matching the colour filter.
            confidence (float | None): Minimum acceptable OCR confidence between 0 and 1;
                ``None`` disables the threshold.
Returns:
list[dict[str, str | int | float | list[int]]]: Text entries with bounding boxes and confidence levels.
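        Example:
            Illustrative call; the region and colour values are placeholders::

                vision.refresh()
                entries = vision.get_text(
                    rect=(0, 0, 300, 200),
                    colors=(255, 255, 0),  # isolate yellow text before OCR
                    tolerance=30,
                    confidence=0.9,
                )
                for entry in entries:
                    print(entry["text"], entry["rect"])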
"""
        sorted_text = self._get_grouped_text(self.opencv_image, rect, colors, tolerance)
        if confidence is not None:
            sorted_text = sorted_text.filter(pl.col("confidence") >= confidence)
        acceptable_text = sorted_text.select("text", "left", "top", "width", "height", "confidence")
if rect:
acceptable_text = acceptable_text.with_columns(pl.col("left") + rect[0], pl.col("top") + rect[1])
result_df = acceptable_text.with_columns(pl.concat_list(["left", "top", "width", "height"]).alias("rect"))
result_df = result_df.select(["text", "rect", "confidence"])
return cast("list[dict[str, str | int | float | list[int]]]", result_df.collect().to_dicts())
@check_valid_image
def get_color(self: Self, point: tuple[int, int]) -> tuple[int, int, int]:
"""Returns the color of the pixel at the specified coordinates.
Args:
point (tuple[int, int]): Pixel coordinates expressed as (x, y).
Returns:
tuple[int, int, int]: Pixel colour as (R, G, B).
Raises:
InvalidImageError: Raised when ``self.opencv_image`` is empty.
IndexError: If the coordinates are out of bounds.
"""
        x, y = point
        if not (0 <= x < self.opencv_image.shape[1] and 0 <= y < self.opencv_image.shape[0]):
            msg = f"Point {point} lies outside the captured image."
            raise IndexError(msg)
        b, g, r = (int(channel) for channel in self.opencv_image[y, x])
        return (r, g, b)
@check_valid_image
def find_color(
self: Self,
color: tuple[int, int, int],
rect: tuple[int, int, int, int] | None = None,
tolerance: int = 0,
) -> list[tuple[int, int]]:
"""Finds all pixel coordinates matching the given color within the specified tolerance.
Args:
color (tuple[int, int, int]): Target colour expressed as (R, G, B).
rect (tuple[int, int, int, int] | None): Optional search region
described as (x, y, width, height).
tolerance (int): Allowed per-channel delta when searching for colour matches.
Returns:
list[tuple[int, int]]: Pixel coordinates in image space that match the colour constraint.
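        Example:
            Illustrative; the colour value is a placeholder::

                vision.refresh()
                points = vision.find_color((0, 255, 0), tolerance=10)
                if points:
                    x, y = points[0]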
"""
image = self._crop_image(rect)
mask = filter_colors(image, color, tolerance)
points = np.stack(np.where(mask == MAX_COLOR_VALUE)[::-1], axis=1)
if rect:
points = np.add(points, rect[0:2])
return cast("list[tuple[int, int]]", points.tolist())
@check_valid_image
def get_average_color(self: Self, rect: tuple[int, int, int, int] | None = None) -> tuple[int, int, int]:
"""Calculates the average color within a specified region.
Args:
rect (tuple[int, int, int, int] | None): Region to average;
``None`` uses the full image.
Returns:
tuple[int, int, int]: Average RGB value inside the requested region.
"""
return cast("tuple[int, int, int]", tuple(int(x) for x in self._get_average_color(self.opencv_image, rect)))
def _get_average_color(
self: Self,
image: npt.NDArray[np.uint8],
rect: tuple[int, int, int, int] | None = None,
) -> npt.NDArray[np.int16]:
"""Calculates the average color of the specified image region.
Args:
            image (npt.NDArray[np.uint8]): Image to sample.
rect (tuple[int, int, int, int] | None): Region to sample;
defaults to the entire frame.
Returns:
            npt.NDArray[np.int16]: Average colour in RGB order.
"""
        image = self._crop_image(rect, image)
        # cv.mean returns BGR (plus an unused alpha term); reverse to RGB.
        avg_color_bgr = np.array(cv.mean(image)[:3], dtype=np.int16)
        return avg_color_bgr[::-1]
@check_valid_image
def get_most_common_color(
self: Self,
rect: tuple[int, int, int, int] | None = None,
index: int = 1,
ignore_colors: tuple[int, int, int] | list[tuple[int, int, int]] | None = None,
) -> tuple[int, int, int]:
"""Determines the most common color in the specified region.
Args:
            rect (tuple[int, int, int, int] | None): Region to inspect;
                ``None`` uses the full image.
index (int): Rank of the dominant colour to extract (1-based).
ignore_colors (tuple[int, int, int] | list[tuple[int, int, int]] | None): Colours to skip while ranking.
Returns:
tuple[int, int, int]: Most common RGB colour in the region.
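        Example:
            Illustrative; ranks colours in a region while skipping pure black::

                top_color = vision.get_most_common_color(
                    rect=(50, 50, 100, 100),
                    ignore_colors=(0, 0, 0),
                )
                runner_up = vision.get_most_common_color(rect=(50, 50, 100, 100), index=2)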
"""
cropped_image = self._crop_image(rect)
reshaped_image = cropped_image.reshape(-1, 3)
        if ignore_colors is not None:
            # Flip the RGB ignore colours to BGR to match the image's channel layout.
            color_values = np.array(ignore_colors, dtype=np.int16)
            if color_values.ndim == 1:
                color_values = color_values[np.newaxis, :]
            color_values = color_values[:, ::-1]
            # Compare whole pixels rather than individual channel values.
            ignore_mask = (reshaped_image[:, None, :] == color_values[None, :, :]).all(axis=2).any(axis=1)
            reshaped_image = reshaped_image[~ignore_mask]
unique, counts = np.unique(reshaped_image, axis=0, return_counts=True)
sorted_indices = np.argsort(counts)[::-1]
desired_index = min(index - 1, len(sorted_indices) - 1)
most_common_color = unique[sorted_indices[desired_index]][::-1]
return (int(most_common_color[0]), int(most_common_color[1]), int(most_common_color[2]))
@check_valid_image
def get_count_of_color(
self: Self,
color: tuple[int, int, int],
rect: tuple[int, int, int, int] | None = None,
tolerance: int | None = 0,
) -> int:
"""Counts the number of pixels matching a given color within a tolerance.
Args:
color (tuple[int, int, int]): Target colour expressed as (R, G, B).
            rect (tuple[int, int, int, int] | None): Region to inspect;
                ``None`` uses the full image.
            tolerance (int | None): Allowed per-channel difference; defaults to 0 for exact matches.
Returns:
int: Number of pixels matching the specified colour.
"""
cropped_image = self._crop_image(rect)
match_mask = filter_colors(cropped_image, color, tolerance or 0)
return int(np.count_nonzero(match_mask))
@check_valid_image
def get_all_colors_with_counts(
self: Self,
rect: tuple[int, int, int, int] | None = None,
) -> list[tuple[tuple[int, int, int], int]]:
"""Retrieves all colors in the specified region along with their pixel counts.
Args:
            rect (tuple[int, int, int, int] | None): Region to inspect;
                ``None`` uses the full image.
Returns:
list[tuple[tuple[int, int, int], int]]: Colour counts ordered by frequency.
"""
cropped_image = self._crop_image(rect)
reshaped_image = cropped_image.reshape(-1, 3)
unique, counts = np.unique(reshaped_image, axis=0, return_counts=True)
sorted_indices = np.argsort(counts)[::-1]
sorted_unique = unique[sorted_indices]
sorted_counts = counts[sorted_indices]
return [
((int(bgr[2]), int(bgr[1]), int(bgr[0])), int(c))
for bgr, c in zip(sorted_unique, sorted_counts, strict=False)
]
@staticmethod
def _get_dominant_color(image: npt.NDArray[np.uint8]) -> npt.NDArray[np.uint8]:
"""Returns the dominant color in the given image.
Args:
image (npt.NDArray[np.uint8]): Input frame in BGR order.
Returns:
npt.NDArray[np.uint8]: Dominant colour in BGR order.
"""
reshaped_image = image.reshape(-1, 3)
return cast("npt.NDArray[np.uint8]", np.median(reshaped_image, axis=0).astype(np.uint8))
@check_valid_image
def maximize_color_match(
self: Self,
rect: tuple[int, int, int, int],
initial_tolerance: int = 100,
tolerance_step: int = 1,
) -> tuple[tuple[int, int, int], int]:
"""Finds the color and tolerance that best match the region's dominant color.
Args:
            rect (tuple[int, int, int, int]): Region to evaluate when
                computing the dominant colour.
            initial_tolerance (int): Tolerance at which the search starts.
            tolerance_step (int): Amount the tolerance is decreased after each iteration.
Returns:
tuple[tuple[int, int, int], int]: Matched RGB colour and the tolerance applied.
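        Example:
            Illustrative; the derived pair can be fed straight back into ``find_color``::

                color, tolerance = vision.maximize_color_match((10, 10, 80, 80))
                points = vision.find_color(color, rect=(10, 10, 80, 80), tolerance=tolerance)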
"""
cropped_image = self._crop_image(rect)
dominant_color = cast("tuple[int, int, int]", tuple(int(x) for x in self._get_dominant_color(cropped_image)))
best_color, best_tolerance = self._find_best_color_match(
cropped_image, dominant_color, initial_tolerance, tolerance_step
)
return best_color, best_tolerance
def _find_best_color_match(
self: Self,
cropped_image: npt.NDArray[np.uint8],
dominant_color: tuple[int, int, int],
initial_tolerance: int,
tolerance_step: int,
) -> tuple[tuple[int, int, int], int]:
"""Searches for the best color match within the specified tolerance range.
Args:
cropped_image (npt.NDArray[np.uint8]): Image region under evaluation.
dominant_color (tuple[int, int, int]): Dominant colour used for comparison.
initial_tolerance (int): Starting tolerance before decrements.
tolerance_step (int): Amount to reduce the tolerance between attempts.
Returns:
tuple[tuple[int, int, int], int]: Matched RGB colour with the tolerance applied.
"""
tolerance = initial_tolerance
best_tolerance = 0
best_ratio = -1.0
inner_total_pixels = cropped_image.size // 3
outer_total_pixels = (self.opencv_image.size // 3) - inner_total_pixels
while tolerance >= 0:
lower_bound, upper_bound = self._get_color_bounds(dominant_color, tolerance)
pixel_count, outside_pixel_count = self._get_pixel_counts(cropped_image, lower_bound, upper_bound)
inner_ratio = pixel_count / inner_total_pixels
outer_ratio = outside_pixel_count / outer_total_pixels
ratio = inner_ratio / (outer_ratio + 1)
if ratio > best_ratio:
best_ratio = ratio
best_tolerance = tolerance
tolerance -= tolerance_step
best_color_rgb = (dominant_color[2], dominant_color[1], dominant_color[0]) # Convert BGR to RGB.
return best_color_rgb, best_tolerance
@staticmethod
def _get_color_bounds(
dominant_color: tuple[int, int, int], tolerance: int
) -> tuple[npt.NDArray[np.uint8], npt.NDArray[np.uint8]]:
"""Calculates lower and upper bounds for a color given a tolerance.
Args:
dominant_color (tuple[int, int, int]): Target colour used to refine matches.
tolerance (int): Channel tolerance currently applied to the search.
Returns:
tuple[npt.NDArray[np.uint8], npt.NDArray[np.uint8]]: Lower and upper BGR bounds.
"""
lower_bound = np.array(
[
max(dominant_color[0] - tolerance, 0),
max(dominant_color[1] - tolerance, 0),
max(dominant_color[2] - tolerance, 0),
]
)
upper_bound = np.array(
[
min(dominant_color[0] + tolerance, 255),
min(dominant_color[1] + tolerance, 255),
min(dominant_color[2] + tolerance, 255),
]
)
return lower_bound, upper_bound
def _get_pixel_counts(
self: Self,
cropped_image: npt.NDArray[np.uint8],
lower_bound: npt.NDArray[np.uint8],
upper_bound: npt.NDArray[np.uint8],
) -> tuple[int, int]:
"""Counts pixels within the specified color bounds in the cropped and main images.
Args:
cropped_image (npt.NDArray[np.uint8]): Image region extracted for analysis.
            lower_bound (npt.NDArray[np.uint8]): Lower inclusive colour bound (BGR).
            upper_bound (npt.NDArray[np.uint8]): Upper inclusive colour bound (BGR).
Returns:
tuple[int, int]: Pixel counts inside the region and outside it.
"""
mask = cv.inRange(cropped_image, lower_bound, upper_bound)
pixel_count = int(np.sum(mask == MAX_COLOR_VALUE))
outside_mask = cv.inRange(self.opencv_image, lower_bound, upper_bound)
outside_pixel_count = int(np.sum(outside_mask == MAX_COLOR_VALUE)) - pixel_count
return pixel_count, outside_pixel_count
@staticmethod
def _calculate_median_difference(
image1: npt.NDArray[np.uint8],
image2: npt.NDArray[np.uint8],
mask: npt.NDArray[np.uint8] | None = None,
) -> int:
"""Calculates the median absolute difference between two images, optionally ignoring masked areas.
Args:
image1: The first image.
image2: The second image.
            mask (npt.NDArray[np.uint8] | None): Optional mask; only pixels where the mask is non-zero are compared.
Returns:
int: Median colour difference, or ``-1`` when shapes mismatch.
"""
        if image1.shape != image2.shape:
            return -1
        # Work in float so uint8 subtraction cannot wrap around and NaN can mark ignored pixels.
        image1 = image1.astype(np.float64)
        image2 = image2.astype(np.float64)
        if mask is not None:
            mask_expanded = np.expand_dims(mask, axis=2)
            image1 = np.where(mask_expanded, image1, np.nan)
            image2 = np.where(mask_expanded, image2, np.nan)
        diff = np.abs(image1 - image2)
        return int(np.nanmedian(diff))
@check_valid_image
def erode_image(self, iterations: int = 1, kernel: npt.NDArray[np.uint8] | None = None) -> None:
"""Applies morphological erosion to the backbuffer image.
Args:
iterations (int): Number of erosion passes to run. Defaults to 1.
kernel (npt.NDArray[np.uint8] | None): Structuring element to use; defaults to a 3x3 ones matrix.
"""
        kernel = kernel if kernel is not None else np.ones((3, 3), np.uint8)
self.opencv_image = cast(
"npt.NDArray[np.uint8]",
cv.erode(self.opencv_image, kernel, iterations=iterations),
)
@check_valid_image
def dilate_image(self, iterations: int = 1, kernel: npt.NDArray[np.uint8] | None = None) -> None:
"""Applies morphological dilation to the backbuffer image.
Args:
iterations (int): Number of dilation passes to run. Defaults to 1.
kernel (npt.NDArray[np.uint8] | None): Structuring element to use; defaults to a 3x3 ones matrix.
"""
        kernel = kernel if kernel is not None else np.ones((3, 3), np.uint8)
self.opencv_image = cast(
"npt.NDArray[np.uint8]",
cv.dilate(self.opencv_image, kernel, iterations=iterations),
)
@check_valid_image
def find_image(
self: Self,
sub_image: npt.NDArray[np.uint8] | Image.Image,
rect: tuple[int, int, int, int] | None = None,
confidence: float = 0.95,
median_tolerance: int | None = None,
) -> list[tuple[int, int, int, int]]:
"""Finds occurrences of a subimage within the main image using template matching.
Args:
sub_image (npt.NDArray[np.uint8] | Image.Image): Subimage to locate inside the backbuffer.
rect (tuple[int, int, int, int] | None): Search region specified as
(x, y, width, height). If None, the entire image is used.
confidence (float): Matching confidence threshold (default ``0.95``).
median_tolerance (int | None): Optional per-channel median colour tolerance for matches.
Returns:
list[tuple[int, int, int, int]]: Bounding boxes locating the subimage.
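        Example:
            Illustrative; ``needle.png`` is a placeholder for a template image,
            optionally carrying an alpha channel used as a match mask::

                needle = Image.open("needle.png")
                matches = vision.find_image(needle, confidence=0.9)
                for x, y, w, h in matches:
                    vision.draw_rectangle((x, y, w, h))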
"""
image = self._crop_image(rect)
sub_image_bgr, mask = self._prepare_sub_image(sub_image)
main_image_gray, sub_image_gray = self._convert_to_grayscale(image, sub_image_bgr)
res = self._perform_template_matching(main_image_gray, sub_image_gray, mask, confidence)
rects = self._process_matching_results(res, image, sub_image_bgr, mask, rect, median_tolerance)
return self._group_and_convert_to_shape_list(rects)
@staticmethod
def _prepare_sub_image(
sub_image: npt.NDArray[np.uint8] | Image.Image,
) -> tuple[npt.NDArray[np.uint8], npt.NDArray[np.uint8] | None]:
"""Prepares the subimage for template matching.
Converts the subimage to BGR format and extracts the alpha channel as a mask if present.
Args:
sub_image (npt.NDArray[np.uint8] | Image.Image): Image used as a template when matching.
Returns:
tuple[npt.NDArray[np.uint8], npt.NDArray[np.uint8] | None]: Prepared template and optional mask.
"""
if isinstance(sub_image, Image.Image):
sub_image = np.array(sub_image.convert("RGBA"))
        if sub_image.shape[-1] == RGBA_CHANNELS:
sub_alpha = sub_image[..., 3]
sub_image_bgr = cv.cvtColor(sub_image[..., :3], cv.COLOR_RGB2BGR)
mask = sub_alpha.astype(np.uint8)
else:
sub_image_bgr = cv.cvtColor(sub_image, cv.COLOR_RGB2BGR)
mask = None
        return cast("npt.NDArray[np.uint8]", sub_image_bgr), cast("npt.NDArray[np.uint8] | None", mask)
@staticmethod
def _convert_to_grayscale(
main_image: npt.NDArray[np.uint8], sub_image_bgr: npt.NDArray[np.uint8]
) -> tuple[npt.NDArray[np.uint8], npt.NDArray[np.uint8]]:
"""Converts the main and subimages to grayscale.
Args:
main_image (npt.NDArray[np.uint8]): Reference image to search within.
sub_image_bgr (npt.NDArray[np.uint8]): Template in BGR colour space.
Returns:
tuple[npt.NDArray[np.uint8], npt.NDArray[np.uint8]]: Grayscale main image and template.
"""
main_image_gray = cv.cvtColor(main_image, cv.COLOR_BGR2GRAY)
sub_image_gray = cv.cvtColor(sub_image_bgr, cv.COLOR_BGR2GRAY)
return cast("npt.NDArray[np.uint8]", main_image_gray), cast("npt.NDArray[np.uint8]", sub_image_gray)
@staticmethod
def _perform_template_matching(
main_image_gray: npt.NDArray[np.uint8],
sub_image_gray: npt.NDArray[np.uint8],
mask: npt.NDArray[np.uint8] | None,
confidence: float,
) -> npt.NDArray[np.uint8]:
"""Performs template matching between the main image and subimage.
Args:
main_image_gray (npt.NDArray[np.uint8]): Main frame in grayscale.
sub_image_gray (npt.NDArray[np.uint8]): Template converted to grayscale for matching.
mask (npt.NDArray[np.uint8] | None): Optional template mask.
confidence (float): Minimum required match confidence.
Returns:
            npt.NDArray[np.uint8]: Binary mask marking where template scores exceed the confidence threshold.
"""
res = cv.matchTemplate(main_image_gray, sub_image_gray, cv.TM_CCORR_NORMED, mask=mask)
return cast(
"npt.NDArray[np.uint8]", np.logical_and(res >= confidence, np.logical_not(np.isinf(res))).astype(np.uint8)
)
def _process_matching_results(
self: Self,
res: npt.NDArray[np.uint8],
main_image: npt.NDArray[np.uint8],
sub_image_bgr: npt.NDArray[np.uint8],
mask: npt.NDArray[np.uint8] | None,
rect: tuple[int, int, int, int] | None,
median_tolerance: int | None,
) -> list[tuple[int, int, int, int]]:
"""Processes template matching results to extract matching rectangles.
Args:
res (npt.NDArray[np.uint8]): Binary mask produced by the template comparison.
main_image (npt.NDArray[np.uint8]): Reference image to search within.
sub_image_bgr (npt.NDArray[np.uint8]): Template in BGR colour space.
mask (npt.NDArray[np.uint8] | None): Optional mask applied to the subimage.
rect (tuple[int, int, int, int] | None): Region to constrain the
template search.
median_tolerance (int | None): Optional tolerance applied to colour medians.
Returns:
list[tuple[int, int, int, int]]: Bounding boxes for detected matches.
"""
rects = []
w, h = sub_image_bgr.shape[1::-1]
locations = np.column_stack(np.where(res))
for i in range(locations.shape[0]):
y = int(locations[i, 0])
x = int(locations[i, 1])
main_image_region = main_image[y : y + h, x : x + w]
found_rect = (
x + (rect[0] if rect else 0),
y + (rect[1] if rect else 0),
w,
h,
)
if median_tolerance is not None:
found_median_diff = self._calculate_median_difference(main_image_region, sub_image_bgr, mask)
if found_median_diff < median_tolerance:
rects.append(found_rect)
else:
rects.append(found_rect)
return rects
@staticmethod
def _group_and_convert_to_shape_list(
rects: list[tuple[int, int, int, int]],
) -> list[tuple[int, int, int, int]]:
"""Groups similar rectangles and returns a consolidated list.
Args:
rects (list[tuple[int, int, int, int]]): Bounding boxes emitted by the matcher.
Returns:
list[tuple[int, int, int, int]]: Grouped rectangles merged by OpenCV's clustering.
"""
        if not rects:
            return []
        # Duplicate each rectangle so cv.groupRectangles keeps singleton matches.
        rects_arr = np.repeat(np.array(rects), 2, axis=0)
        grouped_rects, _ = cv.groupRectangles(rects_arr, groupThreshold=1, eps=0.1)  # type: ignore[arg-type]
        return cast("list[tuple[int, int, int, int]]", grouped_rects)
@check_valid_image
def find_contours(
self: Self,
color: tuple[int, int, int],
rect: tuple[int, int, int, int] | None = None,
tolerance: int = 0,
min_area: int = 10,
vertices: int | None = None,
) -> list[npt.NDArray[np.uintp]]:
"""Finds contours in the image that match the specified color.
Args:
color (tuple[int, int, int]): Target colour expressed as (R, G, B).
rect (tuple[int, int, int, int] | None): Search region specified as
(x, y, width, height). If None, the entire image is used.
tolerance (int): Allowed deviation per colour channel.
            min_area (int): Minimum contour area, in pixels, for a contour to qualify.
vertices (int | None): Required vertex count for returned contours.
Returns:
list[npt.NDArray[np.uintp]]: Contours matching the search criteria.
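        Example:
            Illustrative; looks for roughly rectangular red regions::

                contours = vision.find_contours((255, 0, 0), tolerance=20, min_area=25, vertices=4)
                if contours:
                    x, y, w, h = cv.boundingRect(contours[0])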
"""
image = self._crop_image(rect)
image = filter_colors(image, color, tolerance)
contours, _ = cv.findContours(image, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)
contours = [c + np.array([rect[0], rect[1]]) if rect else c for c in contours]
contours = [c for c in contours if cv.moments(c)["m00"] != 0 and cv.contourArea(c) >= min_area]
if vertices is not None:
contours = [
c
for c in contours
if vertices == len(cv.approxPolyDP(c, 0.01 * cv.arcLength(c, closed=True), closed=True))
]
return cast("list[npt.NDArray[np.uintp]]", contours)
@check_valid_image
def draw_points(
self: Self,
points: Sequence[tuple[int, int]],
color: tuple[int, int, int] = (MAX_COLOR_VALUE, 0, 0),
) -> None:
"""Draws points on the backbuffer image.
Args:
points (Sequence[tuple[int, int]]): Coordinates to mark on the backbuffer.
color (tuple[int, int, int]): Drawing colour (RGB). Defaults to red.
"""
points_arr = np.array(points)
self.opencv_image[points_arr[:, 1], points_arr[:, 0]] = color[::-1]
@check_valid_image
def draw_contours(
self: Self,
        contours: Sequence[npt.NDArray[np.uintp]],
color: tuple[int, int, int] = (MAX_COLOR_VALUE, 0, 0),
) -> None:
"""Draws contours on the backbuffer image.
Args:
            contours (Sequence[npt.NDArray[np.uintp]]): Contour sequences as produced by OpenCV.
color (tuple[int, int, int]): Drawing colour (RGB). Defaults to red.
"""
cv.drawContours(self.opencv_image, contours, -1, color[::-1], 2) # type: ignore[arg-type]
@check_valid_image
def draw_circle(
self: Self,
circle: tuple[int, int, int],
color: tuple[int, int, int] = (MAX_COLOR_VALUE, 0, 0),
) -> None:
"""Draws a circle on the backbuffer image.
Args:
circle (tuple[int, int, int]): Circle definition (x, y, radius).
color (tuple[int, int, int]): Drawing colour (RGB). Defaults to red.
"""
x, y, r = circle
cv.circle(self.opencv_image, (x, y), r, color[::-1], 2, cv.LINE_4)
@check_valid_image
def draw_rectangle(
self: Self,
rect: tuple[int, int, int, int],
color: tuple[int, int, int] = (MAX_COLOR_VALUE, 0, 0),
) -> None:
"""Draws a rectangle on the backbuffer image.
Args:
rect (tuple[int, int, int, int]): Rectangle specified as (x, y, width, height).
color (tuple[int, int, int]): Drawing colour (RGB). Defaults to red.
"""
x, y, w, h = rect
cv.rectangle(self.opencv_image, (x, y), (x + w, y + h), color[::-1], 2, cv.LINE_4)
@check_valid_image
def filter_colors(
self: Self,
colors: tuple[int, int, int] | list[tuple[int, int, int]],
tolerance: int = 0,
*,
keep_original_colors: bool = False,
) -> None:
"""Filters the backbuffer image to retain only specified colors within a given tolerance.
Args:
colors (tuple[int, int, int] | list[tuple[int, int, int]]): Colours to keep while filtering.
tolerance (int): Per-channel tolerance threshold (0-255).
            keep_original_colors (bool): When ``True``, matching pixels retain their original
                colours; when ``False``, the result is a binary mask of the matches.
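        Example:
            Illustrative; reduces the backbuffer to a mask of near-white pixels::

                vision.refresh()
                vision.filter_colors((255, 255, 255), tolerance=40)
                vision.save_backbuffer_to_file("mask.png")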
"""
grey_image = filter_colors(self.opencv_image, colors, tolerance, keep_original_colors=keep_original_colors)
self.opencv_image = grey_image