Skip to content

Commit

Permalink
Look up files by size, or by size and hash
Browse files Browse the repository at this point in the history
  • Loading branch information
marcpage authored and pagerk committed Aug 31, 2024
1 parent 0e7600a commit 76dd166
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 0 deletions.
34 changes: 34 additions & 0 deletions genweb/inventory.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,40 @@ def hash(self, path: str) -> str:

return entry.hash

def _populate_all_stats(self):
    """Fill in stat data for every inventory entry whose size is still unknown.

    Benchmark from the original author: 42,787 files in 0.05 - 0.06 seconds
    on MacBook Pro M2.
    """
    # Snapshot the pending entries first so the inventory is not being
    # iterated while the entries are updated.
    pending = [
        (rel_path, entry)
        for rel_path, entry in self.inventory.items()
        if entry.size is None
    ]

    for rel_path, entry in pending:
        Artifacts._update_stat(entry, join(self.directory, rel_path))

def get_files_of_size(self, size: int) -> list[str]:
    """Return every inventoried file whose size equals ``size``.

    Args:
        size (int): The number of bytes in the file(s) we're looking for
    Returns:
        list[str]: Relative paths of the matching files, in inventory order
    """
    # Sizes are filled in lazily, so make sure every entry has one first.
    self._populate_all_stats()

    matches = []
    for rel_path, entry in self.inventory.items():
        if entry.size == size:
            matches.append(rel_path)
    return matches

def lookup_hashes(self, hash_sizes: dict[str, int]) -> dict[str, list[str]]:
    """Given a map of hashes to the file size each represents, get the matching paths.

    The size is an optimization to prevent the need to hash every file: only
    files whose size matches a requested size are hashed.

    Args:
        hash_sizes (dict[str, int]): Map of hash to filesize
    Returns:
        dict[str, list[str]]: Map of hash to list of relative paths that match
            that hash (an empty list when nothing matches)
    """
    if not hash_sizes:
        return {}

    # Populate stats and scan the inventory once for the whole request,
    # rather than once per requested hash.
    self._populate_all_stats()
    paths_by_size: dict[int, list[str]] = {
        size: [] for size in set(hash_sizes.values())
    }
    for rel_path, entry in self.inventory.items():
        bucket = paths_by_size.get(entry.size)
        if bucket is not None:
            bucket.append(rel_path)

    return {
        wanted: [p for p in paths_by_size[size] if self.hash(p) == wanted]
        for wanted, size in hash_sizes.items()
    }

def refresh(self) -> None:
"""Looks for new files in the artifacts directory"""
all_files = {
Expand Down
30 changes: 30 additions & 0 deletions tests/test_inventory.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,38 @@ def test_hash() -> None:
assert artifacts.hash("file1.txt") == hash_empty, artifacts.hash("file1.txt")


def test_get_files_of_size() -> None:
    """A size lookup over the tests directory finds the 19-byte fixture."""
    tests_dir = dirname(__file__)
    matches = Artifacts(tests_dir).get_files_of_size(19)
    assert "data/fake.jpg" in matches, matches


def test_lookup_hashes() -> None:
    """Looking up by hash and size returns the fixture file for each hash."""
    fake_hash = "109bc4102df941d25c700824514023fc0f7ece6a1d389e3133046ae6f270793e"
    fake2_hash = "83a74d057cb0648281a004a6b70f2824a08a8b818c194a5282317f946191b603"

    with TemporaryDirectory() as working_dir:
        artifacts = Artifacts(dirname(__file__), cache_dir=working_dir)
        results = artifacts.lookup_hashes({fake_hash: 19, fake2_hash: 22})

        assert "data/fake.jpg" in results[fake_hash], results
        assert "data/fake2.jpg" in results[fake2_hash], results


if __name__ == "__main__":
    # Allow running this test module directly, without a test runner.
    for run_test in (
        test_basic,
        test_suffixed,
        test_add,
        test_hash,
        test_get_files_of_size,
        test_lookup_hashes,
    ):
        run_test()

0 comments on commit 76dd166

Please sign in to comment.