Merge pull request #877 from jhu-bids/bugfixes-db-config-etc

Various DB config/refresh bug fixes
jhu-bids · Sep 12, 2024 · 7e71ee1 · 7e71ee1
2 parents e8aec5f + 7dde021
commit 7e71ee1
Show file tree

Hide file tree

Showing 17 changed files with 136 additions and 135 deletions.
diff --git a/backend/db/config.py b/backend/db/config.py
@@ -87,20 +87,6 @@ def get_pg_connect_url(local=False):
 }
 # Table/View configuration: for n3c schema
 CORE_CSET_TABLES = ['code_sets', 'concept_set_container', 'concept_set_version_item', 'concept_set_members']
-# todo: add comment for CORE_CSET_DEPENDENT_TABLES. What does this tell us easily that DERIVED_TABLE_DEPENDENCY_MAP does
-#  not? Is it not computable from DERIVED_TABLE_DEPENDENCY_MAP?
-CORE_CSET_DEPENDENT_TABLES = [
-    # tables
-    'cset_members_items',
-    'codeset_ids_by_concept_id',
-    'concept_ids_by_codeset_id',
-    'members_items_summary',
-    'codeset_counts',
-    'all_csets',
-    # views
-    # 'csets_to_ignore',
-    'cset_members_items_plus',
-]
 # STANDALONE_TABLES & DERIVED_TABLE_DEPENDENCY_MAP
 #  - 100% of tables in the main schema, e.g. n3c, should be listed somewhere in
 # STANDALONE_TABLES: Not derived from any other table, nor used to derive any other table/view. Used for QC testing.
@@ -147,6 +133,7 @@ def get_pg_connect_url(local=False):
     # - views
     # 'csets_to_ignore': ['all_csets'],
     'cset_members_items_plus': ['cset_members_items', 'concept'],
+    'all_csets_view': ['all_csets'],
 
     # Unfinished / unsure
     # - unsure what to do with these. they probably aren't derived either

diff --git a/backend/db/ddl-11-all_csets.jinja.sql b/backend/db/ddl-11-all_csets.jinja.sql
@@ -81,25 +81,4 @@ CREATE INDEX ac_idx1{{optional_index_suffix}} ON {{schema}}all_csets{{optional_s
 
 CREATE INDEX ac_idx2{{optional_index_suffix}} ON {{schema}}all_csets{{optional_suffix}}(concept_set_name);
 
-DROP TABLE {{schema}}cset_term_usage_rec_counts;
-
-
-CREATE OR REPLACE VIEW {{schema}}all_csets_view{{optional_suffix}} AS (
-    SELECT
-        codeset_id,
-        project,
-        alias,
-        is_most_recent_version AS mrv,
-        version AS v,
-        is_draft AS draft,
-        archived AS arch,
-        codeset_created_at::date AS ver_create,
-        container_created_at::date AS cont_create,
-        omop_vocab_version AS omop_voc,
-        distinct_person_cnt AS perscnt,
-        total_cnt AS totcnt,
-        flag_cnts,
-        concepts,
-        container_creator,
-        codeset_creator
-    FROM {{schema}}all_csets{{optional_suffix}});
+DROP TABLE {{schema}}cset_term_usage_rec_counts;
diff --git a/backend/db/ddl-12-all_csets_view.jinja.sql b/backend/db/ddl-12-all_csets_view.jinja.sql
@@ -0,0 +1,20 @@
+-- View: all_csets_view ----------------------------------------------------------------------------------------------------
+CREATE OR REPLACE VIEW {{schema}}all_csets_view{{optional_suffix}} AS (
+    SELECT
+        codeset_id,
+        project,
+        alias,
+        is_most_recent_version AS mrv,
+        version AS v,
+        is_draft AS draft,
+        archived AS arch,
+        codeset_created_at::date AS ver_create,
+        container_created_at::date AS cont_create,
+        omop_vocab_version AS omop_voc,
+        distinct_person_cnt AS perscnt,
+        total_cnt AS totcnt,
+        flag_cnts,
+        concepts,
+        container_creator,
+        codeset_creator
+    FROM {{schema}}all_csets);
diff --git a/.../ddl-12-cset_members_items_plus.jinja.sql → .../ddl-13-cset_members_items_plus.jinja.sql b/.../ddl-12-cset_members_items_plus.jinja.sql → .../ddl-13-cset_members_items_plus.jinja.sql
@@ -5,7 +5,7 @@ SELECT csmi.*
         , c.concept_name
         , c.concept_class_id
 FROM {{schema}}cset_members_items csmi
-JOIN concept c ON csmi.concept_id = c.concept_id);
+JOIN {{schema}}concept c ON csmi.concept_id = c.concept_id);
 -- CREATE INDEX csmip_idx1{{optional_index_suffix}} ON {{schema}}cset_members_items_plus{{optional_suffix}}(codeset_id);
 -- CREATE INDEX csmip_idx2{{optional_index_suffix}} ON {{schema}}cset_members_items_plus{{optional_suffix}}(concept_id);
 -- CREATE INDEX csmip_idx3{{optional_index_suffix}} ON {{schema}}cset_members_items_plus{{optional_suffix}}(codeset_id, concept_id);

diff --git a/...-concepts_with_counts_ungrouped.jinja.sql → ...-concepts_with_counts_ungrouped.jinja.sql b/...-concepts_with_counts_ungrouped.jinja.sql → ...-concepts_with_counts_ungrouped.jinja.sql
diff --git a/.../db/ddl-14-concepts_with_counts.jinja.sql → .../db/ddl-15-concepts_with_counts.jinja.sql b/.../db/ddl-14-concepts_with_counts.jinja.sql → .../db/ddl-15-concepts_with_counts.jinja.sql
diff --git a/...dl-15-concept_relationship_plus.jinja.sql → ...dl-16-concept_relationship_plus.jinja.sql b/...dl-15-concept_relationship_plus.jinja.sql → ...dl-16-concept_relationship_plus.jinja.sql
diff --git a/...db/ddl-16-concept_ancestor_plus.jinja.sql → ...db/ddl-17-concept_ancestor_plus.jinja.sql b/...db/ddl-16-concept_ancestor_plus.jinja.sql → ...db/ddl-17-concept_ancestor_plus.jinja.sql
diff --git a/...ddl-17-add-service-user-to-researcher.sql → ...ddl-18-add-service-user-to-researcher.sql b/...ddl-17-add-service-user-to-researcher.sql → ...ddl-18-add-service-user-to-researcher.sql
diff --git a/backend/db/ddl-18-apirun_groups.jinja.sql → backend/db/ddl-19-apirun_groups.jinja.sql b/backend/db/ddl-18-apirun_groups.jinja.sql → backend/db/ddl-19-apirun_groups.jinja.sql
diff --git a/backend/db/ddl-19-concept_graph.jinja.sql → backend/db/ddl-20-concept_graph.jinja.sql b/backend/db/ddl-19-concept_graph.jinja.sql → backend/db/ddl-20-concept_graph.jinja.sql
diff --git a/backend/db/ddl-20-concept_set_json.jinja.sql → backend/db/ddl-21-concept_set_json.jinja.sql b/backend/db/ddl-20-concept_set_json.jinja.sql → backend/db/ddl-21-concept_set_json.jinja.sql
diff --git a/backend/db/utils.py b/backend/db/utils.py
@@ -35,8 +35,7 @@
 DB_DIR = os.path.dirname(os.path.realpath(__file__))
 PROJECT_ROOT = Path(DB_DIR).parent.parent
 sys.path.insert(0, str(PROJECT_ROOT))
-from backend.db.config import CORE_CSET_DEPENDENT_TABLES, CORE_CSET_TABLES, PG_DATATYPES_BY_GROUP, \
-    RECURSIVE_DEPENDENT_TABLE_MAP, \
+from backend.db.config import CORE_CSET_TABLES, PG_DATATYPES_BY_GROUP, RECURSIVE_DEPENDENT_TABLE_MAP, \
     REFRESH_JOB_MAX_HRS, get_pg_connect_url
 from backend.config import CONFIG, DATASETS_PATH, OBJECTS_PATH
 from backend.utils import commify
@@ -50,6 +49,12 @@
 SCHEMA = CONFIG["schema"]
 
 
+def dedupe_dicts(list_of_dicts: List[Dict]) -> List[Dict]:
+    """Dedupe list of dictionaries
+
+    Each dict is turned into a tuple of its (key, value) items, sorted, so that two dicts with the same contents map
+    to the same tuple regardless of key insertion order. The tuples are collected in a set to drop duplicates and
+    then converted back into dicts.
+
+    Notes:
+     - Keys must be mutually sortable and values must be hashable, since the items are sorted and stored in a set.
+     - The order of the returned list comes from set iteration, so it is not guaranteed to match the input order.
+     - The returned dicts are new objects, built from the sorted items."""
+    # noinspection PyTypeChecker
+    return list(map(dict, set(tuple(sorted(d.items())) for d in list_of_dicts)))
+
+
 def extract_keys_from_nested_dict(d: Dict[str, Dict]) -> List[str]:
     """Extract keys from a nested dictionary.
 
@@ -80,8 +85,8 @@ def get_dependent_tables_queue(independent_tables: Union[List[str], str], _filte
     todo: Replace heuristic w/ a correct algorithm.
      I originally had no steps 2&3, and only 1&4 combined. But the result was out of order. This algorithm below is
      based on a quick (but messy/long) heuristic. Basically, the longer dependency trees go first. This corrected the
-     problem that I had. But this is just a heuristic. I'm feel confident that there is some correct algorithm for this
-     solvable in polynomial time. When this is done, probably should delete CORE_CSET_DEPENDENT_TABLES & its usages.
+     problem that I had. But this is just a heuristic. I feel confident that there is some correct algorithm for this
+     solvable in polynomial time, e.g. a topological sort of the table dependency graph.
     """
     if _filter not in [None, 'tables', 'views']:
         raise ValueError(f'Invalid _filter value: {_filter}. Must be one of "tables" or "views".')
@@ -148,7 +153,7 @@ def refresh_any_dependent_tables(con: Connection, independent_tables: List[str]
 
 
 def refresh_derived_tables_exec(
-    con: Connection, derived_tables_queue: List[str] = CORE_CSET_DEPENDENT_TABLES, schema=SCHEMA
+    con: Connection, derived_tables_queue: List[str], schema=SCHEMA
 ):
     """Refresh TermHub core cset derived tables
 
@@ -227,10 +232,6 @@ def refresh_derived_tables(
         else:
             try:
                 update_db_status_var('last_derived_refresh_request', current_datetime(), local)
-                # The following two calls yield equivalent results as of 2023/08/08. I've commented out
-                #  refresh_derived_tables() in case anything goes wrong with refresh_any_dependent_tables(), since that
-                #  is based on a heuristic currently, and if anything goes wrong, we may want to switch back. -joeflack4
-                # refresh_derived_tables_exec(con, CORE_CSET_DEPENDENT_TABLES, schema)
                 refresh_any_dependent_tables(con, independent_tables, schema)
             finally:
                 update_db_status_var('last_derived_refresh_exited', current_datetime(), local)
@@ -667,7 +668,7 @@ def insert_from_dicts(con: Connection, table: str, rows: List[Dict], skip_if_alr
     if skip_if_already_exists:
         if pk and isinstance(pk, str):  # normal, single primary key
             already_in_db: List[Dict] = get_objs_by_id(con, table, pk, [row[pk] for row in rows])
-            already_in_db_ids = [row[pk] for row in already_in_db]
+            already_in_db_ids = set([row[pk] for row in already_in_db])
             rows = [row for row in rows if row[pk] not in already_in_db_ids]
         elif pk and isinstance(pk, list):  # composite key
             already_in_db: List[Dict] = get_objs_by_composite_key(con, table, pk, rows)