65 lines
2.1 KiB
Python
65 lines
2.1 KiB
Python
"""Utilities to efficiently compute the SHA 256 hash of a bunch of bytes."""
|
|
|
|
from typing import BinaryIO, Optional
|
|
|
|
from .insecure_hashlib import sha1, sha256
|
|
|
|
|
|
def sha_fileobj(fileobj: BinaryIO, chunk_size: Optional[int] = None) -> bytes:
    """
    Hash the content of a binary file object with sha256, reading it piece by piece.

    The stream is consumed from its current position until `read` returns an empty
    result, so the whole content never has to be held in memory at once.

    Args:
        fileobj (file-like object):
            Binary stream to hash, typically the result of `open(path, "rb")`.
        chunk_size (`int`, *optional*):
            How many bytes to request from `fileobj` per read. Falls back to
            1 MiB (`1024 * 1024` bytes) when left as `None`.

    Returns:
        `bytes`: the raw (non-hexadecimal) sha256 digest of the data read from `fileobj`.
    """
    block_size = 1024 * 1024 if chunk_size is None else chunk_size

    hasher = sha256()
    keep_reading = True
    while keep_reading:
        block = fileobj.read(block_size)
        # The final (empty) block is fed to the hasher as well; hashing zero bytes
        # leaves the digest unchanged, and an empty read is what ends the loop.
        hasher.update(block)
        keep_reading = bool(block)
    return hasher.digest()
|
|
|
|
|
|
def git_hash(data: bytes) -> str:
    """
    Return the git blob hash (sha1) of `data`, computed the same way git does.

    The result matches what `git hash-object` produces for a file with this content.
    See https://git-scm.com/docs/git-hash-object for more details.

    Note: this only applies to regular files. For LFS files, the proper git hash is
    meant to be computed on the pointer file content rather than on the actual file
    content. For simplicity, LFS files are instead compared through the sha256 of
    their content.

    Args:
        data (`bytes`):
            The content to compute the git-hash for.

    Returns:
        `str`: the git-hash of `data` as a hexadecimal string.

    Example:
    ```python
    >>> from huggingface_hub.utils.sha import git_hash
    >>> git_hash(b"Hello, World!")
    'b45ef6fec89518d314f546fd6c3025367b721684'
    ```
    """
    # Taken from https://gist.github.com/msabramo/763200
    # Note: no chunked reading here, as this is not meant for huge files (5MB maximum).
    # The hashed payload is a "blob <size>\0" header followed by the raw content.
    header = b"blob " + str(len(data)).encode() + b"\0"

    hasher = sha1()
    for part in (header, data):
        hasher.update(part)
    return hasher.hexdigest()
|