Coverage for src/bob/pipelines/dataset/protocols/hashing.py: 70%

23 statements  

« prev     ^ index     » next       coverage.py v7.6.0, created at 2024-07-12 21:32 +0200

1"""Hashing functionalities for verifying files and computing CRCs.""" 

2 

3 

4import hashlib 

5import os 

6 

7from pathlib import Path 

8from typing import Any, Callable, Union 

9 

10 

def md5_hash(readable: Any, chunk_size: int = 65535) -> str:
    """Compute the MD5 hex digest of any object with a ``read`` method.

    Parameters
    ----------
    readable
        Object whose ``read(chunk_size)`` returns ``bytes``, and ``b""`` once
        exhausted (e.g. a file opened in binary mode).
    chunk_size
        Number of bytes requested on each ``read`` call.

    Returns
    -------
    str
        Hexadecimal MD5 digest of everything read from ``readable``.

    Raises
    ------
    ValueError
        If ``chunk_size`` is zero.
    """
    if chunk_size == 0:
        # On standard binary streams read(0) returns b"" straight away, which
        # would silently produce the digest of empty input regardless of the
        # actual content.
        raise ValueError("chunk_size must be non-zero")
    hasher = hashlib.md5()
    # Two-argument iter(): keep calling read() until it returns the b""
    # sentinel, so the content is hashed chunk by chunk.
    for chunk in iter(lambda: readable.read(chunk_size), b""):
        hasher.update(chunk)
    return hasher.hexdigest()

17 

18 

def sha256_hash(readable: Any, chunk_size: int = 65535) -> str:
    """Compute the SHA256 hex digest of any object with a ``read`` method.

    Parameters
    ----------
    readable
        Object whose ``read(chunk_size)`` returns ``bytes``, and ``b""`` once
        exhausted (e.g. a file opened in binary mode).
    chunk_size
        Number of bytes requested on each ``read`` call.

    Returns
    -------
    str
        Hexadecimal SHA256 digest of everything read from ``readable``.

    Raises
    ------
    ValueError
        If ``chunk_size`` is zero.
    """
    if chunk_size == 0:
        # On standard binary streams read(0) returns b"" straight away, which
        # would silently produce the digest of empty input regardless of the
        # actual content.
        raise ValueError("chunk_size must be non-zero")
    hasher = hashlib.sha256()
    # Two-argument iter(): keep calling read() until it returns the b""
    # sentinel, so the content is hashed chunk by chunk.
    for chunk in iter(lambda: readable.read(chunk_size), b""):
        hasher.update(chunk)
    return hasher.hexdigest()

25 

26 

def verify_file(
    file_path: Union[str, os.PathLike],
    file_hash: str,
    hash_fct: Callable[[Any, int], str] = sha256_hash,
    full_match: bool = False,
) -> bool:
    """Return True if the computed hash of the file corresponds to ``file_hash``.

    For comfort, ``file_hash`` is allowed to match only the first characters
    of the digest, so that e.g. just the first 8 characters can be stored.

    Parameters
    ----------
    file_path
        The path to the file needing verification.
    file_hash
        The expected file hash digest (or, when ``full_match`` is False, a
        non-empty prefix of it).
    hash_fct
        A function called with the file opened in binary mode and a chunk
        size, returning the digest as a string. Defaults to SHA256.
    full_match
        If set to False, allows ``file_hash`` to match the first characters of
        the file's digest (this allows storing e.g. 8 chars of a digest
        instead of the whole 64 characters of SHA256, and still matching).

    Returns
    -------
    bool
        True when the digest equals ``file_hash`` (``full_match=True``) or
        starts with a non-empty ``file_hash`` (``full_match=False``).
    """
    with Path(file_path).open("rb") as f:
        digest = hash_fct(f, 65535)
    if full_match:
        return digest == file_hash
    # Every string starts with "", so an empty expected hash would make any
    # file pass verification; treat it as a mismatch instead.
    return bool(file_hash) and digest.startswith(file_hash)

56 

57 

def compute_crc(
    file_path: Union[str, os.PathLike],
    hash_fct: Callable[[Any, int], str] = sha256_hash,
) -> str:
    """Return the checksum of a file as computed by ``hash_fct``.

    Despite the name, the value is whatever digest ``hash_fct`` produces
    (SHA256 by default), not a cyclic redundancy check.

    Parameters
    ----------
    file_path
        The path to the file to hash.
    hash_fct
        A function called with the file opened in binary mode and a chunk
        size, returning the digest as a string. Defaults to SHA256.

    Returns
    -------
    str
        The digest returned by ``hash_fct`` for the file's content.
    """
    with Path(file_path).open("rb") as f:
        return hash_fct(f, 65535)

65 return hash_fct(f, 65535)