Skip to content

Commit

Permalink
Merge pull request #373 from openzim/stream_dl
Browse files Browse the repository at this point in the history
Stream files downloads to not exhaust memory
  • Loading branch information
benoit74 committed Aug 12, 2024
2 parents d0d0c6e + efdf780 commit d814c23
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 16 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Add support for uncompressed tar archive in --warcs (#369)

### Fixed

- Stream file downloads to avoid exhausting memory (#373)

## [2.1.0] - 2024-08-09

### Added
Expand Down
10 changes: 10 additions & 0 deletions src/zimit/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import logging

from zimscraperlib.logging import getLogger

# Process exit codes shared across the zimit package.
# NOTE(review): the meanings below are inferred from the constant names only;
# the code that produces or compares them is not visible here — confirm.
# Presumably returned when the warc2zim pre-run check fails.
EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
# Presumably signals that the crawler stopped because a configured limit was hit.
EXIT_CODE_CRAWLER_LIMIT_HIT = 11
# Presumably the exit code warc2zim reports on a normal (successful) run.
NORMAL_WARC2ZIM_EXIT_CODE = 100
# Passed as the `timeout=` argument of `requests.get` by
# zimit.utils.download_file (seconds, per the requests API).
REQUESTS_TIMEOUT = 10

# Shared logger named "zimit", created at INFO level when this module is imported.
logger = getLogger(name="zimit", level=logging.INFO)
14 changes: 14 additions & 0 deletions src/zimit/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from pathlib import Path

import requests

from zimit.constants import REQUESTS_TIMEOUT


def download_file(url: str, fpath: Path, *, chunk_size: int = 8192) -> None:
    """Download the content at url into fpath, streaming it in chunks.

    The response body is written to disk chunk by chunk, so the whole payload
    never has to be held in memory at once.

    Args:
        url: location fetched with an HTTP GET request.
        fpath: destination file; opened in binary write mode (created or
            truncated) and filled with the response body.
        chunk_size: maximum number of bytes pulled from the response per
            iteration. Defaults to 8192, the previously hard-coded value.

    Raises:
        requests.HTTPError: raised by ``raise_for_status()`` when the server
            answers with an error status.
    """
    with requests.get(url, timeout=REQUESTS_TIMEOUT, stream=True) as resp:
        # Check the status before opening the destination, so an HTTP error
        # neither creates nor truncates fpath.
        resp.raise_for_status()
        with open(fpath, "wb") as f:
            for chunk in resp.iter_content(chunk_size=chunk_size):
                f.write(chunk)
25 changes: 9 additions & 16 deletions src/zimit/zimit.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

import atexit
import json
import logging
import re
import shutil
import signal
Expand All @@ -21,19 +20,17 @@

import inotify
import inotify.adapters
import requests
from warc2zim.main import main as warc2zim
from zimscraperlib.logging import getLogger
from zimscraperlib.uri import rebuild_uri

from zimit.__about__ import __version__

EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
EXIT_CODE_CRAWLER_LIMIT_HIT = 11
NORMAL_WARC2ZIM_EXIT_CODE = 100
REQUESTS_TIMEOUT = 10

logger = getLogger(name="zimit", level=logging.INFO)
from zimit.constants import (
EXIT_CODE_CRAWLER_LIMIT_HIT,
EXIT_CODE_WARC2ZIM_CHECK_FAILED,
NORMAL_WARC2ZIM_EXIT_CODE,
logger,
)
from zimit.utils import download_file


class ProgressFileWatcher:
Expand Down Expand Up @@ -457,9 +454,7 @@ def cleanup():
f"Downloading browser profile from {custom_behavior} "
f"to {behaviors_file.name}"
)
resp = requests.get(custom_behavior, timeout=REQUESTS_TIMEOUT)
resp.raise_for_status()
Path(behaviors_file.name).write_bytes(resp.content)
download_file(custom_behavior, Path(behaviors_file.name))
else:
logger.info(
f"Copying browser profile from {custom_behavior} "
Expand Down Expand Up @@ -552,9 +547,7 @@ def cleanup():
# collisions
warc_file = Path(filename.name)
logger.info(f"Downloading WARC(s) from {warc_location} to {warc_file}")
resp = requests.get(warc_location, timeout=REQUESTS_TIMEOUT)
resp.raise_for_status()
warc_file.write_bytes(resp.content)
download_file(warc_location, warc_file)

# if it is a plain warc or warc.gz, simply add it to the list
if suffix in {".warc", ".warc.gz"}:
Expand Down

0 comments on commit d814c23

Please sign in to comment.