From efdf7804c07bb23eb25f5c47e03a820b51277e01 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 12 Aug 2024 19:47:10 +0000 Subject: [PATCH] Stream files downloads to not exhaust memory --- CHANGELOG.md | 4 ++++ src/zimit/constants.py | 10 ++++++++++ src/zimit/utils.py | 14 ++++++++++++++ src/zimit/zimit.py | 25 +++++++++---------------- 4 files changed, 37 insertions(+), 16 deletions(-) create mode 100644 src/zimit/constants.py create mode 100644 src/zimit/utils.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c34360..7d76305 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add support for uncompressed tar archive in --warcs (#369) +### Fixed + +- Stream files downloads to not exhaust memory (#373) + ## [2.1.0] - 2024-08-09 ### Added diff --git a/src/zimit/constants.py b/src/zimit/constants.py new file mode 100644 index 0000000..f81905a --- /dev/null +++ b/src/zimit/constants.py @@ -0,0 +1,10 @@ +import logging + +from zimscraperlib.logging import getLogger + +EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2 +EXIT_CODE_CRAWLER_LIMIT_HIT = 11 +NORMAL_WARC2ZIM_EXIT_CODE = 100 +REQUESTS_TIMEOUT = 10 + +logger = getLogger(name="zimit", level=logging.INFO) diff --git a/src/zimit/utils.py b/src/zimit/utils.py new file mode 100644 index 0000000..f2b78f4 --- /dev/null +++ b/src/zimit/utils.py @@ -0,0 +1,14 @@ +from pathlib import Path + +import requests + +from zimit.constants import REQUESTS_TIMEOUT + + +def download_file(url: str, fpath: Path): + """Download file from url to fpath with streaming""" + with requests.get(url, timeout=REQUESTS_TIMEOUT, stream=True) as resp: + resp.raise_for_status() + with open(fpath, "wb") as f: + for chunk in resp.iter_content(chunk_size=8192): + f.write(chunk) diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index 9101747..4e80d07 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -6,7 +6,6 @@ import atexit import json -import logging import re import shutil import signal @@ -21,19 +20,17 @@ import inotify import inotify.adapters -import requests from warc2zim.main import main as warc2zim -from zimscraperlib.logging import getLogger from zimscraperlib.uri import rebuild_uri from zimit.__about__ import __version__ - -EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2 -EXIT_CODE_CRAWLER_LIMIT_HIT = 11 -NORMAL_WARC2ZIM_EXIT_CODE = 100 -REQUESTS_TIMEOUT = 10 - -logger = getLogger(name="zimit", level=logging.INFO) +from zimit.constants import ( + EXIT_CODE_CRAWLER_LIMIT_HIT, + EXIT_CODE_WARC2ZIM_CHECK_FAILED, + NORMAL_WARC2ZIM_EXIT_CODE, + logger, +) +from zimit.utils import download_file class ProgressFileWatcher: @@ -457,9 +454,7 @@ def cleanup(): f"Downloading browser profile from {custom_behavior} " f"to {behaviors_file.name}" ) - resp = requests.get(custom_behavior, timeout=REQUESTS_TIMEOUT) - resp.raise_for_status() - Path(behaviors_file.name).write_bytes(resp.content) + download_file(custom_behavior, Path(behaviors_file.name)) else: logger.info( f"Copying browser profile from {custom_behavior} " @@ -552,9 +547,7 @@ def cleanup(): # collisions warc_file = Path(filename.name) logger.info(f"Downloading WARC(s) from {warc_location} to {warc_file}") - resp = requests.get(warc_location, timeout=REQUESTS_TIMEOUT) - resp.raise_for_status() - warc_file.write_bytes(resp.content) + download_file(warc_location, warc_file) # if it is a plain warc or warc.gz, simply add it to the list if suffix in {".warc", ".warc.gz"}: