diff --git a/.github/workflows/build-push-dev-image.yml b/.github/workflows/build-push-dev-image.yml new file mode 100644 index 00000000..13f8cfb7 --- /dev/null +++ b/.github/workflows/build-push-dev-image.yml @@ -0,0 +1,86 @@ +# Workflow responsible for the +# development release processes. +# +name: Build-Push-Dev-Image +on: + push: + branches: + - develop + paths-ignore: + - README.md + - .old_cicd/** + - .github/* + - .github/workflows/* + - LICENSE + - .gitignore + - .dockerignore + - .githooks/** + # Do not build another image on a pull request ('**' also covers branch names containing '/'). + # Any push to develop will trigger a new build however. + pull_request: + branches-ignore: + - '**' + +jobs: + build-push-dev-image: + runs-on: ubuntu-latest + steps: + + - name: Checkout Code + uses: actions/checkout@v3 + with: + ref: ${{ github.head_ref }} + # fetch-depth: 0 means, get all branches and commits + fetch-depth: 0 + + - name: Set short git commit SHA + id: vars + run: | + echo "short_sha=$(git rev-parse --short ${{ github.sha }})" >> $GITHUB_OUTPUT + # https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ + + - name: Confirm git commit SHA output + run: echo ${{ steps.vars.outputs.short_sha }} + + # Docker Buildx is important to caching in the Build And Push Container + # step + # https://github.com/marketplace/actions/build-and-push-docker-images + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: | + network=host + + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + logout: true + + - name: Login to Container Registry + uses: docker/login-action@v3 + with: + registry: containers.renci.org + username: ${{ secrets.CONTAINERHUB_USERNAME }} + password: ${{ secrets.CONTAINERHUB_TOKEN }} + logout: true + + + # Notes on Cache: + # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + - name: Build 
Push Container + uses: docker/build-push-action@v5 + with: + context: . + push: true + # Push to renci-registry and dockerhub here. + # cache comes from dockerhub. + tags: | + ${{ github.repository }}:develop + ${{ github.repository }}:${{ steps.vars.outputs.short_sha }} + containers.renci.org/${{ github.repository }}:develop + containers.renci.org/${{ github.repository }}:${{ steps.vars.outputs.short_sha }} + cache-from: type=registry,ref=${{ github.repository }}:buildcache-dev + cache-to: type=registry,ref=${{ github.repository }}:buildcache-dev,mode=max \ No newline at end of file diff --git a/.github/workflows/build-push-release.yml b/.github/workflows/build-push-release.yml new file mode 100644 index 00000000..07b22d21 --- /dev/null +++ b/.github/workflows/build-push-release.yml @@ -0,0 +1,131 @@ +# Workflow responsible for the +# major release processes. +# + +name: Build-Push-Release +on: + push: + branches: + - master + - main + paths-ignore: + - README.md + - .old_cicd/** + - .github/* + - .github/workflows/* + - LICENSE + - .gitignore + - .dockerignore + - .githooks/** + tags-ignore: + - '**' +jobs: + build-push-release: + runs-on: ubuntu-latest + steps: + - name: Checkout Code + uses: actions/checkout@v3 + with: + ref: ${{ github.head_ref }} + fetch-depth: 0 + + - name: Set short git commit SHA + id: vars + run: | + echo "short_sha=$(git rev-parse --short ${{ github.sha }})" >> $GITHUB_OUTPUT + # https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ + + - name: Confirm git commit SHA output + run: echo ${{ steps.vars.outputs.short_sha }} + + # https://github.com/marketplace/actions/git-semantic-version + - name: Semver Check + uses: paulhatch/semantic-version@v5.0.3 + id: version + with: + # The prefix to use to identify tags + tag_prefix: "v" + # A string which, if present in a git commit, indicates that a change represents a + # major (breaking) change, supports regular expressions wrapped with '/' + 
major_pattern: "/breaking:|major:/" + # A string which indicates the flags used by the `major_pattern` regular expression. Supported flags: idgs + major_regexp_flags: "ig" + # Same as major_pattern, except it indicates a minor change; supports regular expressions wrapped with '/' + minor_pattern: "/feat:|feature:|minor:/" + # A string which indicates the flags used by the `minor_pattern` regular expression. Supported flags: idgs + minor_regexp_flags: "ig" + # A string to determine the format of the version output + # version_format: "${major}.${minor}.${patch}-prerelease${increment}" + version_format: "${major}.${minor}.${patch}" + search_commit_body: false + + # Docker Buildx is important for caching in the Build Push Container + # step. + # https://github.com/marketplace/actions/build-and-push-docker-images + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: | + network=host + + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + logout: true + + - name: Login to Container Registry + uses: docker/login-action@v3 + with: + registry: containers.renci.org + username: ${{ secrets.CONTAINERHUB_USERNAME }} + password: ${{ secrets.CONTAINERHUB_TOKEN }} + logout: true + + # Notes on Cache: + # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + - name: Build Push Container + uses: docker/build-push-action@v5 + with: + push: true + # Push to the RENCI registry (containers.renci.org) and DockerHub here. + # The build cache comes from DockerHub. 
+ tags: | + containers.renci.org/${{ github.repository }}:v${{ steps.version.outputs.version }} + containers.renci.org/${{ github.repository }}:latest + containers.renci.org/${{ github.repository }}:${{ steps.vars.outputs.short_sha }} + ${{ github.repository }}:v${{ steps.version.outputs.version }} + ${{ github.repository }}:latest + ${{ github.repository }}:${{ steps.vars.outputs.short_sha }} + cache-from: type=registry,ref=${{ github.repository }}:buildcache-release + cache-to: type=registry,ref=${{ github.repository }}:buildcache-release,mode=max + +#==========================TAG & RELEASE W/ NOTES ========================= + + # Note: GITHUB_TOKEN is an auto-generated secret that GitHub + # provides automatically to workflows run by GitHub Actions. + # https://docs.github.com/en/actions/security-guides/automatic-token-authentication + # https://docs.github.com/en/rest/git/tags?apiVersion=2022-11-28#create-a-tag-object + # https://docs.github.com/en/rest/git/refs?apiVersion=2022-11-28#create-a-reference + # The step below creates a "lightweight" tag through the git/refs REST endpoint. 
+ - name: Create Tag for Release + run: | + curl \ + -s --fail -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/${{ github.repository }}/git/refs \ + -d '{"ref":"refs/tags/v${{ steps.version.outputs.version }}","sha":"${{ github.sha }}"}' + +# https://cli.github.com/manual/gh_release_create + - name: Create Release + env: + RELEASE_VERSION: v${{ steps.version.outputs.version }} # must match the "v"-prefixed tag created by the previous step + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh release create "$RELEASE_VERSION" \ + -t "$RELEASE_VERSION" \ + --generate-notes \ + --latest \ No newline at end of file diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml new file mode 100644 index 00000000..b7f3e6a5 --- /dev/null +++ b/.github/workflows/code-checks.yml @@ -0,0 +1,129 @@ +# Workflow responsible for core acceptance testing. +# Checks Currently Run: +# - flake8-linter +# - test-image-build (Docker image build) +# - PYTest job is currently commented out; no Bandit job is defined here +# For PR Vulnerability Scanning a separate workflow will run. +# The build-push-dev-image and build-push-release workflows +# handle the develop and release image storage respectively. +# +# + +name: Code-Checks +on: + push: + branches-ignore: + - master + - main + - develop + pull_request: + branches: + - develop + - master + - main + types: [opened, synchronize] + paths-ignore: + - README.md + - .old_cicd/** + - .github/* + - .github/workflows/* + - LICENSE + - .gitignore + - .dockerignore + - .githooks/** + +jobs: + ############################## flake8-linter ############################## + flake8-linter: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.12" # NOTE(review): the Dockerfile base image is python3.11 - confirm 3.12 is intended + + # Currently actions/setup-python supports caching + # but the cache is not as robust as cache action. + # Here we cache the entire python env which speeds subsequent builds up alot. 
(alot being scientific term) + # Ref: https://blog.allenai.org/python-caching-in-github-actions-e9452698e98d + - uses: actions/cache@v3 + name: Cache Python + with: + path: ${{ env.pythonLocation }} + key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('pyproject.toml') }} + + - name: Install Requirements + run: | + pip install -r requirements.txt + + - name: Lint with flake8 + run: | + pip install flake8 + flake8 --ignore=E,W dags + # We continue on error here until the code is clean + # flake8 --ignore=E,W --exit-zero . + continue-on-error: true + + ################################### PYTEST ################################### + # pytest: + # runs-on: ubuntu-latest + # steps: + # - uses: actions/checkout@v3 + # - name: Set up Python + # uses: actions/setup-python@v4 + # with: + # python-version: '3.12' + + # - name: Install Requirements + # run: | + # pip install -r requirements.txt + # pip install coverage + # pip install ./tests + + # - name: Test with pytest + # run: | + # make test + ############################## test-image-build ############################## + test-image-build: + runs-on: ubuntu-latest + # if: ${{ github.actor == 'dependabot[bot]' }} + steps: + - uses: actions/checkout@v3 + + - name: Set short git commit SHA + id: vars + run: | + echo "short_sha=$(git rev-parse --short ${{ github.sha }})" >> $GITHUB_OUTPUT + # https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ + - name: Confirm git commit SHA output + run: echo ${{ steps.vars.outputs.short_sha }} + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + logout: true + + - name: Parse Github Reference Name + id: branch # GHR = ref name with its last "/segment" removed (e.g. "12/merge" -> "12") + run: | + REF="$GITHUB_REF_NAME" + echo "GHR=${REF%/*}" >> $GITHUB_OUTPUT + + # Notes 
on Cache: + # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + - name: Build Container + uses: docker/build-push-action@v5 + with: + context: . + push: true + tags: | + ${{ github.repository }}:test_${{ steps.branch.outputs.GHR }} + cache-from: type=registry,ref=${{ github.repository }}:buildcache + cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max diff --git a/.github/workflows/trivy-pr-scan.yml b/.github/workflows/trivy-pr-scan.yml new file mode 100644 index 00000000..1e7bc060 --- /dev/null +++ b/.github/workflows/trivy-pr-scan.yml @@ -0,0 +1,67 @@ +name: trivy-pr-scan +on: + pull_request: + branches: + - develop + - master + - main + types: [ opened, synchronize ] + paths-ignore: + - README.md + - .old_cicd/** + - .github/* + - .github/workflows/* + - LICENSE + - .gitignore + - .dockerignore + - .githooks/** + +jobs: + trivy-pr-scan: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: | + network=host + + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + logout: true + + # Notes on Cache: + # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + - name: Build Container + uses: docker/build-push-action@v5 + with: + context: . 
+ push: false + load: true + tags: ${{ github.repository }}:vuln-test + cache-from: type=registry,ref=${{ github.repository }}:buildcache + cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max + + # We will not be concerned with Medium and Low vulnerabilities + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@master # NOTE(review): pin to a release tag or commit SHA instead of a moving branch + with: + image-ref: '${{ github.repository }}:vuln-test' + format: 'sarif' + severity: 'CRITICAL,HIGH' + ignore-unfixed: true + output: 'trivy-results.sarif' + exit-code: '1' + # Scan results should be viewable in GitHub Security Dashboard + # The scan step fails the job when findings exist, so the upload below uses + # '!cancelled()' to run regardless, unless the run is manually canceled. + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v3 + if: '!cancelled()' + with: + sarif_file: 'trivy-results.sarif' \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 49c1fd26..7760c983 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ FROM apache/airflow:2.7.2-python3.11 USER root RUN apt-get update && \ - apt-get install -y git nano vim + apt-get install -y git nano vim gcc COPY requirements.txt requirements.txt USER airflow RUN pip install -r requirements.txt diff --git a/Jenkinsfile b/Jenkinsfile deleted file mode 100644 index a68ba92b..00000000 --- a/Jenkinsfile +++ /dev/null @@ -1,84 +0,0 @@ -library 'pipeline-utils@master' - -pipeline { - agent { - kubernetes { - label 'kaniko-build-agent' - yaml ''' -kind: Pod -metadata: - name: kaniko -spec: - containers: - - name: jnlp - workingDir: /home/jenkins/agent - - name: kaniko - workingDir: /home/jenkins/agent - image: gcr.io/kaniko-project/executor:debug - imagePullPolicy: Always - resources: - requests: - cpu: "512m" - memory: "1024Mi" - ephemeral-storage: "4Gi" - limits: - cpu: "1024m" - memory: "2048Mi" - ephemeral-storage: "8Gi" - command: - - /busybox/cat - tty: true - volumeMounts: - - name: jenkins-docker-cfg - mountPath: 
/kaniko/.docker - volumes: - - name: jenkins-docker-cfg - projected: - sources: - - secret: - name: rencibuild-imagepull-secret - items: - - key: .dockerconfigjson - path: config.json -''' - } - } - environment { - PATH = "/busybox:/kaniko:/ko-app/:$PATH" - DOCKERHUB_CREDS = credentials("${env.CONTAINERS_REGISTRY_CREDS_ID_STR}") - REGISTRY = "${env.REGISTRY}" - REG_OWNER="helxplatform" - REG_APP="roger" - COMMIT_HASH="${sh(script:"git rev-parse --short HEAD", returnStdout: true).trim()}" - VERSION_FILE="./dags/_version.py" - VERSION="${sh(script:'awk \'{ print $3 }\' ./dags/_version.py | xargs', returnStdout: true).trim()}" - IMAGE_NAME="${REGISTRY}/${REG_OWNER}/${REG_APP}" - TAG1="$BRANCH_NAME" - TAG2="$COMMIT_HASH" - TAG3="$VERSION" - TAG4="latest" - } - stages { - stage('Test') { - steps { - sh ''' - echo "Test stage" - ''' - } - } - stage('Build') { - steps { - script { - container(name: 'kaniko', shell: '/busybox/sh') { - if (env.BRANCH_NAME == "main") { - // Tag with latest and version iff when pushed to master - kaniko.buildAndPush("./Dockerfile", ["$IMAGE_NAME:$TAG1", "$IMAGE_NAME:$TAG2", "$IMAGE_NAME:$TAG3", "$IMAGE_NAME:$TAG4"]) - } else { - kaniko.buildAndPush("./Dockerfile", ["$IMAGE_NAME:$TAG1", "$IMAGE_NAME:$TAG2"]) - } - } - } - } - } - } -} diff --git a/dags/roger/config/__init__.py b/dags/roger/config/__init__.py index ac9eb23a..71111f39 100644 --- a/dags/roger/config/__init__.py +++ b/dags/roger/config/__init__.py @@ -99,6 +99,21 @@ class AnnotationConfig(DictLike): "sapbert": { "classification_url": "https://med-nemo.apps.renci.org/annotate/", "annotator_url": "https://babel-sapbert.apps.renci.org/annotate/", + "score_threshold": 0.8, + "bagel": { + "enabled": False, + "url": "https://bagel.apps.renci.org/group_synonyms_openai", # NOTE(review): config.yaml uses http://localhost:9099/group_synonyms_openai - confirm the intended default + "prompt": "bagel/ask_classes", + "llm_args": { + "llm_model_name": "gpt-4o-2024-05-13", + "organization": "", + "access_key": "", + "llm_model_args": { + "top_p": 0, + "temperature": 0.1 + } + } + } }, } ) @@ -119,6 
+134,9 @@ class AnnotationConfig(DictLike): "PATO", "CHEBI", "MONDO", "UBERON", "HP", "MESH", "UMLS" ]) + def __post_init__(self): + bagel = self.annotator_args["sapbert"]["bagel"] # "enabled" may be a bool (the default) or a str such as "true" + bagel["enabled"] = str(bagel["enabled"]).lower() == "true" @dataclass diff --git a/dags/roger/config/config.yaml b/dags/roger/config/config.yaml index e9402ce4..c407555f 100644 --- a/dags/roger/config/config.yaml +++ b/dags/roger/config/config.yaml @@ -1,6 +1,6 @@ redisgraph: username: "" - password: "12345" + password: "weak" host: localhost graph: test port: 6379 @@ -42,13 +42,25 @@ bulk_loader: annotation: clear_http_cache: false - annotator_type: monarch + annotator_type: sapbert annotator_args: monarch: url: "https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content=" sapbert: classification_url: "https://med-nemo.apps.renci.org/annotate/" - annotator_url: "https://babel-sapbert.apps.renci.org/annotate/" + annotator_url: "https://sap-qdrant.apps.renci.org/annotate/" + score_threshold: 0.8 + bagel: + enabled: false + url: "http://localhost:9099/group_synonyms_openai" + prompt: "bagel/ask_classes" + llm_args: + llm_model_name: "gpt-4o-2024-05-13" + organization: "" + access_key: "" + llm_model_args: + top_p: 0 + temperature: 0.1 normalizer: "https://nodenormalization-dev.apps.renci.org/get_normalized_nodes?conflate=false&description=true&curie=" synonym_service: "https://name-resolution-sri.renci.org/reverse_lookup" ontology_metadata: "https://api.monarchinitiative.org/api/bioentity/" @@ -93,9 +105,9 @@ indexing: action: "files" elasticsearch: - host: elasticsearch + host: localhost username: elastic - password: "" + password: "12345" nboost_host: "" scheme: "http" ca_path: "" diff --git a/dags/roger/pipelines/bdc_pipelines.py b/dags/roger/pipelines/bdc_pipelines.py new file mode 100644 index 00000000..5a945641 --- /dev/null +++ 
b/dags/roger/pipelines/bdc_pipelines.py @@ -0,0 +1,48 @@ +"Dug pipelines for the BDC dbGaP data sets, one DugPipeline subclass per data set" + +from roger.pipelines import DugPipeline + +class BIOLINCCdbGaPPipeline(DugPipeline): + "Pipeline for the biolincc dbGaP data set" + pipeline_name = 'biolincc' # NOTE(review): the only pipeline_name here without a "-dbgap" suffix - confirm intentional + parser_name = 'biolincc' + + +class covid19dbGaPPipeline(DugPipeline): + "Pipeline for the covid19 dbGaP data set" + pipeline_name = 'covid19-dbgap' + parser_name = 'covid19' + +class dirDbGaPPipeline(DugPipeline): + pipeline_name = "dir-dbgap" + parser_name = "dir" + +class LungMapDbGaPPipeline(DugPipeline): + pipeline_name = "lungmap-dbgap" + parser_name = "lungmap" + +class nsrrDbGaPPipeline(DugPipeline): + pipeline_name = "nsrr-dbgap" + parser_name = "nsrr" + +class ParentDbGaPPipeline(DugPipeline): + pipeline_name = "parent-dbgap" + parser_name = "parent" + +class PCGCDbGaPPipeline(DugPipeline): + pipeline_name = "pcgc-dbgap" + parser_name = "pcgc" + +class RecoverDbGaPPipeline(DugPipeline): + pipeline_name = "recover-dbgap" + parser_name = "recover" + +class TopmedDBGaPPipeline(DugPipeline): + pipeline_name = "topmed-gen3-dbgap" + parser_name = "topmeddbgap" + +class CureSCPipeline(DugPipeline): + pipeline_name = "curesc-dbgap" + parser_name = "curesc" + + diff --git a/requirements.txt b/requirements.txt index d1b1f68f..3a0ee223 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ jsonpickle redisgraph-bulk-loader==0.12.3 pytest PyYAML -git+https://github.com/helxplatform/dug@2.13.1 +git+https://github.com/helxplatform/dug@2.13.2 orjson kg-utils==0.0.6 bmt==1.1.0