diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 447c1eb7..51b13b90 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -27,6 +27,7 @@ jobs: - name: Install sec-certs run: | pip install -e . + python -m spacy download en_core_web_sm - name: Run tests run: pytest --cov=sec_certs tests - name: Code coverage upload diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 82687161..68f2ec14 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -35,13 +35,10 @@ Note on single-sourcing the package version: More can be read [here](https://pac ### Currently, the release process is as follows 1. Update dependencies with `pre-commit autoupdate`, pin new versions of linters into `pyproject.toml`. -2. Run `cd requirements && ./compile.sh` to update dependencies. -3. Use `python -m spacy download en_core_web_sm` to find out the current version of `en_core_web_sm` dependency. Update pyproject.toml link of `en_core_web_sm` dependency with up-to-date link from [GitHub](https://github.com/explosion/spacy-models/releases). -4. Run `cd requirements && ./compile.sh` **again** to update dependencies. +2. Run `cd requirements && ./compile.sh` to update dependencies, then commit the changes. 5. Create a release from GitHub UI. Include release notes, add proper version tag and publish the release (or create it from scratch with new tag). 6. This will automatically update PyPi and DockerHub packages. - ## Quality assurance All commits shall pass the lint pipeline of the following tools: diff --git a/Dockerfile b/Dockerfile index 53a5a75f..e0040227 100644 --- a/Dockerfile +++ b/Dockerfile @@ -58,7 +58,8 @@ RUN \ pip3 install -U pip wheel pip-tools && \ pip-sync requirements/requirements.txt && \ pip3 install --no-cache notebook jupyterlab && \ - pip3 install -e . + pip3 install -e . 
&& \ + python -m spacy download en_core_web_sm # just to be sure that pdftotext is in $PATH ENV PATH /usr/bin/pdftotext:${PATH} diff --git a/docs/installation.md b/docs/installation.md index e62149e6..d489b03c 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -6,7 +6,7 @@ The tool can be installed from PyPi with ```bash -pip install -U sec-certs +pip install -U sec-certs && python -m spacy download en_core_web_sm ``` Note, that `Python>=3.10` is required. @@ -31,6 +31,7 @@ git clone https://github.com/crocs-muni/sec-certs.git python3 -m venv venv source venv/bin/activate pip install -e . +python -m spacy download en_core_web_sm ``` Alternatively, our Our [Dockerfile](https://github.com/crocs-muni/sec-certs/blob/main/Dockerfile) represents a reproducible way of setting up the environment. diff --git a/docs/quickstart.md b/docs/quickstart.md index 49731e88..f2e3690f 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -3,7 +3,7 @@ ::::{tab-set} :::{tab-item} Common Criteria -1. Install the latest version with `pip install -U sec-certs` (see [installation](installation.md)). +1. Install the latest version with `pip install -U sec-certs && python -m spacy download en_core_web_sm` (see [installation](installation.md)). 2. In your Python interpreter, type ```python from sec_certs.dataset.cc import CCDataset @@ -16,7 +16,7 @@ to obtain to obtain freshly processed dataset from [seccerts.org](https://seccer ::: :::{tab-item} FIPS 140 -1. Install the latest version with `pip install -U sec-certs` (see [installation](installation.md)). +1. Install the latest version with `pip install -U sec-certs && python -m spacy download en_core_web_sm` (see [installation](installation.md)). 2. 
In your Python interpreter, type ```python from sec_certs.dataset.fips import FIPSDataset diff --git a/pyproject.toml b/pyproject.toml index ae19a5c0..e1d4397c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,6 @@ "ipykernel", "ipywidgets", "spacy", - "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl", "pkgconfig", "seaborn", "pySankeyBeta", diff --git a/requirements/all_requirements.txt b/requirements/all_requirements.txt index 7a836c31..4424f3ac 100644 --- a/requirements/all_requirements.txt +++ b/requirements/all_requirements.txt @@ -130,8 +130,6 @@ docutils==0.19 # myst-parser # pydata-sphinx-theme # sphinx -en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl - # via sec-certs (./../pyproject.toml) evaluate==0.4.1 # via setfit executing==2.0.1 @@ -644,9 +642,7 @@ snowballstemmer==2.2.0 soupsieve==2.5 # via beautifulsoup4 spacy==3.7.2 - # via - # en-core-web-sm - # sec-certs (./../pyproject.toml) + # via sec-certs (./../pyproject.toml) spacy-legacy==3.0.12 # via spacy spacy-loggers==1.0.5 diff --git a/requirements/dev_requirements.txt b/requirements/dev_requirements.txt index 1b028276..82951940 100644 --- a/requirements/dev_requirements.txt +++ b/requirements/dev_requirements.txt @@ -99,8 +99,6 @@ docutils==0.19 # myst-parser # pydata-sphinx-theme # sphinx -en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl - # via sec-certs (./../pyproject.toml) executing==2.0.1 # via stack-data fastjsonschema==2.19.0 @@ -449,9 +447,7 @@ snowballstemmer==2.2.0 soupsieve==2.5 # via beautifulsoup4 spacy==3.7.2 - # via - # en-core-web-sm - # sec-certs (./../pyproject.toml) + # via sec-certs (./../pyproject.toml) spacy-legacy==3.0.12 # via spacy spacy-loggers==1.0.5 diff --git a/requirements/nlp_requirements.txt 
b/requirements/nlp_requirements.txt index 1445a3e2..3d96efa3 100644 --- a/requirements/nlp_requirements.txt +++ b/requirements/nlp_requirements.txt @@ -101,8 +101,6 @@ dill==0.3.7 # multiprocess distro==1.8.0 # via tabula-py -en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl - # via sec-certs (./../pyproject.toml) evaluate==0.4.1 # via setfit executing==2.0.1 @@ -518,9 +516,7 @@ smart-open==6.4.0 soupsieve==2.5 # via beautifulsoup4 spacy==3.7.2 - # via - # en-core-web-sm - # sec-certs (./../pyproject.toml) + # via sec-certs (./../pyproject.toml) spacy-legacy==3.0.12 # via spacy spacy-loggers==1.0.5 diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 7f3b639c..209dc0ee 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -11,7 +11,7 @@ attrs==23.1.0 # jsonschema # referencing beautifulsoup4==4.12.2 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) blis==0.7.11 # via thinc catalogue==2.0.10 @@ -27,7 +27,7 @@ charset-normalizer==3.3.2 # via requests click==8.1.7 # via - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) # typer cloudpathlib==0.16.0 # via weasel @@ -58,24 +58,22 @@ deprecated==1.2.14 # via pikepdf distro==1.8.0 # via tabula-py -en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl - # via sec-certs (../pyproject.toml) executing==2.0.1 # via stack-data fonttools==4.45.0 # via matplotlib html5lib==1.1 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) idna==3.4 # via requests ipykernel==6.27.0 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) ipython==8.17.2 # via # ipykernel # ipywidgets ipywidgets==8.1.1 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) jedi==0.19.1 # via ipython jinja2==3.1.3 @@ -83,7 
+81,7 @@ jinja2==3.1.3 joblib==1.3.2 # via scikit-learn jsonschema==4.20.0 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) jsonschema-specifications==2023.11.1 # via jsonschema jupyter-client==8.6.0 @@ -101,14 +99,14 @@ langcodes==3.3.0 lxml==4.9.3 # via # pikepdf - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) markupsafe==2.1.3 # via jinja2 matplotlib==3.8.2 # via # pysankeybeta # seaborn - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) matplotlib-inline==0.1.6 # via # ipykernel @@ -121,7 +119,7 @@ murmurhash==1.0.10 nest-asyncio==1.5.8 # via ipykernel networkx==3.2.1 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) numpy==1.26.2 # via # blis @@ -132,7 +130,7 @@ numpy==1.26.2 # scikit-learn # scipy # seaborn - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) # spacy # tabula-py # thinc @@ -150,24 +148,24 @@ pandas==2.1.3 # via # pysankeybeta # seaborn - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) # tabula-py parso==0.8.3 # via jedi pdftotext==2.2.2 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) pexpect==4.8.0 # via ipython pikepdf==8.7.1 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) pillow==10.2.0 # via # matplotlib # pikepdf # pytesseract - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) pkgconfig==1.5.5 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) platformdirs==4.0.0 # via jupyter-core preshed==3.0.9 @@ -179,7 +177,7 @@ prompt-toolkit==3.0.41 psutil==5.9.6 # via # ipykernel - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) ptyprocess==0.7.0 # via pexpect pure-eval==0.2.2 @@ -190,49 +188,51 @@ pydantic==2.5.2 # via # confection # pydantic-settings - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) # spacy # thinc # weasel pydantic-core==2.14.5 # via pydantic 
pydantic-settings==2.1.0 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) pygments==2.17.2 # via ipython pyparsing==3.1.1 # via matplotlib pypdf[crypto]==3.17.1 - # via sec-certs (../pyproject.toml) + # via + # pypdf + # sec-certs (./../pyproject.toml) pysankeybeta==1.4.1 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) pytesseract==0.3.10 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) python-dateutil==2.8.2 # via # jupyter-client # matplotlib # pandas - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) python-dotenv==1.0.0 # via pydantic-settings pytz==2023.3.post1 # via pandas pyyaml==6.0.1 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) pyzmq==25.1.1 # via # ipykernel # jupyter-client rapidfuzz==3.5.2 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) referencing==0.31.0 # via # jsonschema # jsonschema-specifications requests==2.31.0 # via - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) # spacy # weasel rpds-py==0.13.1 @@ -240,17 +240,17 @@ rpds-py==0.13.1 # jsonschema # referencing scikit-learn==1.3.2 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) scipy==1.11.4 # via # scikit-learn - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) seaborn==0.13.0 # via # pysankeybeta - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) setuptools-scm==8.0.4 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) six==1.16.0 # via # asttokens @@ -263,9 +263,7 @@ smart-open==6.4.0 soupsieve==2.5 # via beautifulsoup4 spacy==3.7.2 - # via - # en-core-web-sm - # sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) spacy-legacy==3.0.12 # via spacy spacy-loggers==1.0.5 @@ -279,7 +277,7 @@ srsly==2.4.8 stack-data==0.6.3 # via ipython tabula-py==2.9.0 - # via sec-certs (../pyproject.toml) + # via 
sec-certs (./../pyproject.toml) thinc==8.2.1 # via spacy threadpoolctl==3.2.0 @@ -290,7 +288,7 @@ tornado==6.3.3 # jupyter-client tqdm==4.66.1 # via - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) # spacy traitlets==5.13.0 # via diff --git a/requirements/test_requirements.txt b/requirements/test_requirements.txt index 96f02e88..f3e76cd7 100644 --- a/requirements/test_requirements.txt +++ b/requirements/test_requirements.txt @@ -62,8 +62,6 @@ deprecated==1.2.14 # via pikepdf distro==1.8.0 # via tabula-py -en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl - # via sec-certs (./../pyproject.toml) executing==2.0.1 # via stack-data fonttools==4.45.0 @@ -280,9 +278,7 @@ smart-open==6.4.0 soupsieve==2.5 # via beautifulsoup4 spacy==3.7.2 - # via - # en-core-web-sm - # sec-certs (./../pyproject.toml) + # via sec-certs (./../pyproject.toml) spacy-legacy==3.0.12 # via spacy spacy-loggers==1.0.5