diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..7e5385f
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,12 @@
+root = true
+
+[*]
+end_of_line = lf
+insert_final_newline = true
+charset = utf-8
+indent_style = space
+indent_size = 4
+trim_trailing_whitespace = true
+
+[*.yaml]
+indent_size = 2
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..1d53b70
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,4 @@
+*.json linguist-generated
+*.xml linguist-generated
+*.tsv linguist-generated
+*.csv linguist-generated
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 0000000..dd90a2b
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,23 @@
+# To get started with Dependabot version updates, you'll need to specify which
+# package ecosystems to update and where the package manifests are located.
+# Please see the documentation for more information:
+# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
+# https://containers.dev/guide/dependabot
+
+version: 2
+updates:
+  - package-ecosystem: "pip" # See documentation for possible values
+    directory: "/" # Location of package manifests
+    schedule:
+      interval: "weekly"
+  # Enable version updates for Actions
+  - package-ecosystem: "github-actions"
+    # Look for `.github/workflows` in the `root` directory
+    directory: "/"
+    # Check for updates once a day
+    schedule:
+      interval: "daily"
+  - package-ecosystem: "devcontainers"
+    directory: "/"
+    schedule:
+      interval: weekly
\ No newline at end of file
diff --git a/.github/workflows/scrape.yml b/.github/workflows/scrape.yml
new file mode 100644
index 0000000..59a09dd
--- /dev/null
+++ b/.github/workflows/scrape.yml
@@ -0,0 +1,44 @@
+name: "Scrape platform"
+on:
+  workflow_dispatch:
+  push:
+    paths:
+      - "scraper/*"
+  schedule:
+    - cron: "0 12 * * 0"
+permissions:
+  contents: write
+jobs:
+  Scrape:
+    name: "Scrape platform and convert as Media 
object"
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v4
+      - name: "Load secrets, if any"
+        uses: oNaiPs/secrets-to-env-action@v1
+        with:
+          secrets: ${{ toJson(secrets) }}
+      - name: "Setup python"
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+          cache: "pip"
+      - name: "Install dependencies"
+        run: pip install -U -r requirements.txt
+      - name: "Run scraper"
+        shell: pwsh
+        run: |
+          try {
+            python3 scraper
+            $date = Get-Date -Format "yyyy-MM-dd'T'HH:mm:sszzzz"
+            git config --local user.email "167072439+rensetsu-bot@users.noreply.github.com"
+            git config --local user.name "Rensetsu[bot]"
+            git add .
+            git commit -m "Scrape data, $date update"
+            git push
+          }
+          catch {
+            "Failed to commit, check logs!"
+            exit 1
+          }
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..68bc17f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,160 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
diff --git a/.vscode/extensions.json b/.vscode/extensions.json
new file mode 100644
index 0000000..5d21c81
--- /dev/null
+++ b/.vscode/extensions.json
@@ -0,0 +1,5 @@
+{
+    "recommendations": [
+        "charliermarsh.ruff"
+    ]
+}
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..ee3ed90
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,7 @@
+{
+    "python.analysis.typeCheckingMode": "strict",
+    "python.analysis.autoImportCompletions": true,
+    "python.analysis.diagnosticSeverityOverrides": {
+        "reportMissingTypeStubs": "none"
+    }
+}
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..df624ee
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Rensetsu
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, 
and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..eafdfae
--- /dev/null
+++ b/README.md
@@ -0,0 +1,30 @@
+# Rensetsu Service Scraper for {PLATFORM}
+
+This is a command-line utility to scrape {PLATFORM} data and convert it to
+Rensetsu Media object format.
+
+## Requirements
+
+* Python >= 3.10
+
+## Setup
+
+Simply run the following commands before using the app:
+
+```sh
+git clone https://github.com/rensetsu/{REPO_NAME}
+cd {REPO_NAME}
+python3 -m venv venv
+source ./venv/bin/activate
+pip install -U -r requirements.txt
+```
+
+## Usage
+
+```sh
+python scraper
+```
+
+## License
+
+This repo is licensed under [MIT License](LICENSE), unless stated otherwise.
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..7f7d3fc
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,7 @@
+[project]
+requires-python = ">=3.10"
+name = "rensetsu"
+
+[tool.ruff]
+line-length = 80
+
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..a5f1221
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+git+https://github.com/rensetsu/librensetsu.git
diff --git a/scraper/__init__.py b/scraper/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scraper/__main__.py b/scraper/__main__.py
new file mode 100644
index 0000000..13e625d
--- /dev/null
+++ b/scraper/__main__.py
@@ -0,0 +1,29 @@
+from sys import exit as sysexit
+from time import time
+from traceback import print_exc
+
+from consts import Status, pprint
+from librensetsu.humanclock import convert_float_to_time
+from loops import do_loop
+
+
+def main() -> None:
+    """Run the scraper loop, log elapsed time, and exit 1 on any error."""
+    start = time()
+    ex = 0
+    pprint.print(Status.INFO, "Starting...")
+    try:
+        do_loop()
+    except Exception as e:
+        pprint.print(Status.ERR, f"An error occurred: {e}")
+        print_exc()
+        ex = 1
+    end = time()
+    pprint.print(
+        Status.INFO, f"Time elapsed: {convert_float_to_time(end - start)}"
+    )
+    sysexit(ex)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scraper/consts.py b/scraper/consts.py
new file mode 100644
index 0000000..ee6205b
--- /dev/null
+++ b/scraper/consts.py
@@ -0,0 +1,9 @@
+from librensetsu.prettyprint import PrettyPrint, Platform, Status
+
+pprint = PrettyPrint(Platform.SYSTEM)
+PLATFORM = ""  # TODO: set the platform slug; used to derive the file names below
+RAW_DB = f"{PLATFORM}_raw.json"
+DEST = f"{PLATFORM}.json"
+DESTM = f"{PLATFORM}_min.json"
+
+__all__ = ['pprint', 'Platform', 'Status', 'PLATFORM', 'RAW_DB', 'DEST', 'DESTM']
diff --git a/scraper/loops.py b/scraper/loops.py
new file mode 100644
index 0000000..588c9c8
--- /dev/null
+++ b/scraper/loops.py
@@ -0,0 +1,97 @@
+from copy import deepcopy
+from dataclasses import asdict
+from json import dump, loads
+from typing import Any
+from uuid import uuid4
+
+from alive_progress import alive_bar as abr
+from consts import DEST, DESTM, RAW_DB, Status, pprint
+from dacite import from_dict
+from librensetsu.formatter import remove_empty_keys
+from librensetsu.models import Date, MediaInfo, RelationMaps
+
+
+def process_item(item: Any, old_uuid: str | None = None) -> MediaInfo:
+    """
+    Process the item to a MediaInfo object
+    :param item: The item to process
+    :type item: Any
+    :param old_uuid: The old UUID to reuse, if the item was seen before
+    :type old_uuid: str | None
+    :return: The processed item
+    :rtype: MediaInfo
+    """
+    # import logic here
+    return MediaInfo(
+        uuid=old_uuid or str(uuid4()),
+    )
+
+
+def lookup_uuid(items: list[Any], old_items: list[Any]) -> list[MediaInfo]:
+    """
+    Lookup the UUID of the items
+    :param items: The items to lookup
+    :type items: list[Any]
+    :param old_items: The old items to lookup
+    :type old_items: list[Any]
+    Items are matched against the old items' ``mappings`` entry so that
+    previously assigned UUIDs stay stable between scrapes.
+    :return: The list of MediaInfo
+    :rtype: list[MediaInfo]
+    """
+    loi = len(old_items)
+    new_data: list[MediaInfo] = []
+    pprint.print(Status.INFO, "Processing items")
+    with abr(total=len(items)) as bar:  # type: ignore
+        for item in items:
+            media_id = item.media_id
+            # Edit ^^^^^^^^^
+            uuid = None
+            if loi > 0:
+                for old_item in old_items:
+                    if media_id == old_item["mappings"][""]:
+                        # Edit ^^^^
+                        uuid = old_item["uuid"]
+                        break
+            final = process_item(item, uuid)
+            new_data.append(final)
+            bar()
+    return new_data
+
+
+def dump_json(final_data: list[dict[str, Any]]) -> None:
+    """
+    Dump the JSON file
+    :param final_data: The final data to dump
+    :type final_data: list[dict[str, Any]]
+    """
+    pprint.print(Status.INFO, f"Dumping JSON file to {DEST}")
+    with open(DEST, "w") as f:
+        dump(final_data, f, ensure_ascii=False)
+    pprint.print(Status.INFO, f"Dumping Minified JSON file to {DESTM}")
+    mininfo = remove_empty_keys(deepcopy(final_data))  # type: ignore
+    with open(DESTM, "w") as f:
+        dump(mininfo, f, ensure_ascii=False)
+
+
+def do_loop() -> list[MediaInfo]:
+    """
+    Loops all the objects to convert them to a list of MediaInfo
+    :return: List of `MediaInfo`
+    :rtype: list[MediaInfo]
+    """
+    try:
+        with open(DEST, "r") as f:
+            old_data: list[dict[str, Any]] = loads(f.read())
+    except FileNotFoundError:
+        old_data = []
+    with open(RAW_DB, "r") as f:
+        data = loads(f.read())
+    final_data = lookup_uuid(data, old_data)
+    new_data = [asdict(x) for x in final_data]
+    # sort
+    pprint.print(Status.INFO, "Sorting data")
+    new_data.sort(key=lambda x: x["mappings"][""])  # type: ignore
+    # EDIT ^^^^
+    dump_json(new_data)
+    return final_data