From efdf7804c07bb23eb25f5c47e03a820b51277e01 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Mon, 12 Aug 2024 19:47:10 +0000
Subject: [PATCH] Stream files downloads to not exhaust memory

---
 CHANGELOG.md           |  4 ++++
 src/zimit/constants.py | 10 ++++++++++
 src/zimit/utils.py     | 14 ++++++++++++++
 src/zimit/zimit.py     | 25 +++++++++----------------
 4 files changed, 37 insertions(+), 16 deletions(-)
 create mode 100644 src/zimit/constants.py
 create mode 100644 src/zimit/utils.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0c34360..7d76305 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Add support for uncompressed tar archive in --warcs (#369)
 
+### Fixed
+
+- Stream file downloads to avoid exhausting memory (#373)
+
 ## [2.1.0] - 2024-08-09
 
 ### Added
diff --git a/src/zimit/constants.py b/src/zimit/constants.py
new file mode 100644
index 0000000..f81905a
--- /dev/null
+++ b/src/zimit/constants.py
@@ -0,0 +1,10 @@
+import logging
+
+from zimscraperlib.logging import getLogger
+
+EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
+EXIT_CODE_CRAWLER_LIMIT_HIT = 11
+NORMAL_WARC2ZIM_EXIT_CODE = 100
+REQUESTS_TIMEOUT = 10  # passed as `timeout` to requests.get (seconds)
+
+logger = getLogger(name="zimit", level=logging.INFO)  # shared by zimit modules
diff --git a/src/zimit/utils.py b/src/zimit/utils.py
new file mode 100644
index 0000000..f2b78f4
--- /dev/null
+++ b/src/zimit/utils.py
@@ -0,0 +1,14 @@
+from pathlib import Path
+
+import requests
+
+from zimit.constants import REQUESTS_TIMEOUT
+
+
+def download_file(url: str, fpath: Path):
+    """Fetch url and write its body to fpath in chunks, never buffering it whole"""
+    with requests.get(url, timeout=REQUESTS_TIMEOUT, stream=True) as response:
+        # raise on HTTP error status before the destination file is opened
+        response.raise_for_status()
+        with open(fpath, "wb") as sink:
+            sink.writelines(response.iter_content(chunk_size=8192))
diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py
index 9101747..4e80d07 100755
--- a/src/zimit/zimit.py
+++ b/src/zimit/zimit.py
@@ -6,7 +6,6 @@
 
 import atexit
 import json
-import logging
 import re
 import shutil
 import signal
@@ -21,19 +20,17 @@
 
 import inotify
 import inotify.adapters
-import requests
 from warc2zim.main import main as warc2zim
-from zimscraperlib.logging import getLogger
 from zimscraperlib.uri import rebuild_uri
 
 from zimit.__about__ import __version__
-
-EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
-EXIT_CODE_CRAWLER_LIMIT_HIT = 11
-NORMAL_WARC2ZIM_EXIT_CODE = 100
-REQUESTS_TIMEOUT = 10
-
-logger = getLogger(name="zimit", level=logging.INFO)
+from zimit.constants import (
+    EXIT_CODE_CRAWLER_LIMIT_HIT,
+    EXIT_CODE_WARC2ZIM_CHECK_FAILED,
+    NORMAL_WARC2ZIM_EXIT_CODE,
+    logger,
+)
+from zimit.utils import download_file
 
 
 class ProgressFileWatcher:
@@ -457,9 +454,7 @@ def cleanup():
                     f"Downloading browser profile from {custom_behavior} "
                     f"to {behaviors_file.name}"
                 )
-                resp = requests.get(custom_behavior, timeout=REQUESTS_TIMEOUT)
-                resp.raise_for_status()
-                Path(behaviors_file.name).write_bytes(resp.content)
+                download_file(custom_behavior, Path(behaviors_file.name))
             else:
                 logger.info(
                     f"Copying browser profile from {custom_behavior} "
@@ -552,9 +547,7 @@ def cleanup():
             # collisions
             warc_file = Path(filename.name)
             logger.info(f"Downloading WARC(s) from {warc_location} to {warc_file}")
-            resp = requests.get(warc_location, timeout=REQUESTS_TIMEOUT)
-            resp.raise_for_status()
-            warc_file.write_bytes(resp.content)
+            download_file(warc_location, warc_file)
 
             # if it is a plain warc or warc.gz, simply add it to the list
             if suffix in {".warc", ".warc.gz"}: