PhotosAPI/modules/hasher.py

from pathlib import Path
from typing import Union

import cv2
import numpy as np
from numpy.typing import NDArray
from scipy import spatial

from modules.database import col_photos


def hash_array_to_hash_hex(hash_array):
    # convert hash array of 0 or 1 to hash string in hex
    hash_array = np.array(hash_array, dtype=np.uint8)
    hash_str = "".join(str(i) for i in 1 * hash_array.flatten())
    return hex(int(hash_str, 2))


def hash_hex_to_hash_array(hash_hex) -> NDArray:
    # convert hash string in hex to hash values of 0 or 1
    hash_str = int(hash_hex, 16)
    array_str = bin(hash_str)[2:]
    return np.array(list(array_str), dtype=np.float32)


def get_duplicates_cache(album: str) -> dict:
    return {
        photo["filename"]: [photo["_id"].__str__(), photo["hash"]]
        for photo in col_photos.find({"album": album})
    }


async def get_phash(filepath: Union[str, Path]) -> str:
    img = cv2.imread(str(filepath))
    # resize image and convert to gray scale
    img = cv2.resize(img, (64, 64))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img = np.array(img, dtype=np.float32)
    # calculate dct of image
    dct = cv2.dct(img)
    # to reduce hash length take only 8*8 top-left block
    # as this block has more information than the rest
    dct_block = dct[:8, :8]
    # caclulate mean of dct block excluding first term i.e, dct(0, 0)
    dct_average = (dct_block.mean() * dct_block.size - dct_block[0, 0]) / (
        dct_block.size - 1
    )
    # convert dct block to binary values based on dct_average
    dct_block[dct_block < dct_average] = 0.0
    dct_block[dct_block != 0] = 1.0
    # store hash value
    return hash_array_to_hash_hex(dct_block.flatten())


async def get_duplicates(hash_string: str, album: str) -> list:
    duplicates = []
    cache = get_duplicates_cache(album)
    for image_name, image_object in cache.items():
        try:
            distance = spatial.distance.hamming(
                hash_hex_to_hash_array(cache[image_name][1]),
                hash_hex_to_hash_array(hash_string),
            )
        except ValueError:
            continue
        # print("{0:<30} {1}".format(image_name, distance), flush=True)
        if distance <= 0.1:
            duplicates.append(
                {
                    "id": cache[image_name][0],
                    "filename": image_name,
                    "difference": distance,
                }
            )
    return duplicates
Fixed path error 2023-06-23 13:09:36 +03:00			`from pathlib import Path`
			`from typing import Union`

Sorted imports with isort 2023-06-22 14:17:53 +03:00			`import cv2`
Migrated from main API 2022-12-20 02:22:32 +02:00			`import numpy as np`
			`from numpy.typing import NDArray`
			`from scipy import spatial`
Sorted imports with isort 2023-06-22 14:17:53 +03:00
			`from modules.database import col_photos`
Migrated from main API 2022-12-20 02:22:32 +02:00
Formatted everything with black 2023-03-12 15:59:13 +02:00
Migrated from main API 2022-12-20 02:22:32 +02:00			`def hash_array_to_hash_hex(hash_array):`
Formatted everything with black 2023-03-12 15:59:13 +02:00			`# convert hash array of 0 or 1 to hash string in hex`
			`hash_array = np.array(hash_array, dtype=np.uint8)`
			`hash_str = "".join(str(i) for i in 1 * hash_array.flatten())`
			`return hex(int(hash_str, 2))`

Migrated from main API 2022-12-20 02:22:32 +02:00
			`def hash_hex_to_hash_array(hash_hex) -> NDArray:`
Formatted everything with black 2023-03-12 15:59:13 +02:00			`# convert hash string in hex to hash values of 0 or 1`
			`hash_str = int(hash_hex, 16)`
			`array_str = bin(hash_str)[2:]`
Refactor changed are done 2023-06-23 13:17:01 +03:00			`return np.array(list(array_str), dtype=np.float32)`
Formatted everything with black 2023-03-12 15:59:13 +02:00
Migrated from main API 2022-12-20 02:22:32 +02:00
			`def get_duplicates_cache(album: str) -> dict:`
Refactor changed are done 2023-06-23 13:17:01 +03:00			`return {`
			`photo["filename"]: [photo["_id"].__str__(), photo["hash"]]`
			`for photo in col_photos.find({"album": album})`
			`}`
Migrated from main API 2022-12-20 02:22:32 +02:00
Formatted everything with black 2023-03-12 15:59:13 +02:00
Fixed path error 2023-06-23 13:09:36 +03:00			`async def get_phash(filepath: Union[str, Path]) -> str:`
			`img = cv2.imread(str(filepath))`
Migrated from main API 2022-12-20 02:22:32 +02:00			`# resize image and convert to gray scale`
			`img = cv2.resize(img, (64, 64))`
			`img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)`
Formatted everything with black 2023-03-12 15:59:13 +02:00			`img = np.array(img, dtype=np.float32)`
			`# calculate dct of image`
Migrated from main API 2022-12-20 02:22:32 +02:00			`dct = cv2.dct(img)`
Formatted everything with black 2023-03-12 15:59:13 +02:00			`# to reduce hash length take only 8*8 top-left block`
Migrated from main API 2022-12-20 02:22:32 +02:00			`# as this block has more information than the rest`
Formatted everything with black 2023-03-12 15:59:13 +02:00			`dct_block = dct[:8, :8]`
Migrated from main API 2022-12-20 02:22:32 +02:00			`# caclulate mean of dct block excluding first term i.e, dct(0, 0)`
Formatted everything with black 2023-03-12 15:59:13 +02:00			`dct_average = (dct_block.mean() * dct_block.size - dct_block[0, 0]) / (`
			`dct_block.size - 1`
			`)`
Migrated from main API 2022-12-20 02:22:32 +02:00			`# convert dct block to binary values based on dct_average`
			`dct_block[dct_block < dct_average] = 0.0`
			`dct_block[dct_block != 0] = 1.0`
			`# store hash value`
			`return hash_array_to_hash_hex(dct_block.flatten())`

Formatted everything with black 2023-03-12 15:59:13 +02:00
Fixed path error 2023-06-23 13:09:36 +03:00			`async def get_duplicates(hash_string: str, album: str) -> list:`
Migrated from main API 2022-12-20 02:22:32 +02:00			`duplicates = []`
			`cache = get_duplicates_cache(album)`
Refactor changed are done 2023-06-23 13:17:01 +03:00			`for image_name, image_object in cache.items():`
Added raise for incorrect geo exif 2023-02-14 15:32:20 +02:00			`try:`
			`distance = spatial.distance.hamming(`
Formatted everything with black 2023-03-12 15:59:13 +02:00			`hash_hex_to_hash_array(cache[image_name][1]),`
Fixed path error 2023-06-23 13:09:36 +03:00			`hash_hex_to_hash_array(hash_string),`
Added raise for incorrect geo exif 2023-02-14 15:32:20 +02:00			`)`
			`except ValueError:`
			`continue`
Changed hashing behavior 2023-02-15 17:08:01 +02:00			`# print("{0:<30} {1}".format(image_name, distance), flush=True)`
			`if distance <= 0.1:`
Formatted everything with black 2023-03-12 15:59:13 +02:00			`duplicates.append(`
			`{`
			`"id": cache[image_name][0],`
			`"filename": image_name,`
			`"difference": distance,`
			`}`
			`)`
			`return duplicates`