Merge branch 'develop' of github.com:jhu-bids/TermHub into develop
Sigfried committed Sep 12, 2024
2 parents 58a9989 + 7e71ee1 commit de1c4fb
Showing 18 changed files with 165 additions and 142 deletions.
15 changes: 1 addition & 14 deletions backend/db/config.py
@@ -87,20 +87,6 @@ def get_pg_connect_url(local=False):
}
# Table/View configuration: for n3c schema
CORE_CSET_TABLES = ['code_sets', 'concept_set_container', 'concept_set_version_item', 'concept_set_members']
-# todo: add comment for CORE_CSET_DEPENDENT_TABLES. What does this tell us easily that DERIVED_TABLE_DEPENDENCY_MAP does
-# not? Is it not computable from DERIVED_TABLE_DEPENDENCY_MAP?
-CORE_CSET_DEPENDENT_TABLES = [
-    # tables
-    'cset_members_items',
-    'codeset_ids_by_concept_id',
-    'concept_ids_by_codeset_id',
-    'members_items_summary',
-    'codeset_counts',
-    'all_csets',
-    # views
-    # 'csets_to_ignore',
-    'cset_members_items_plus',
-]
# STANDALONE_TABLES & DERIVED_TABLE_DEPENDENCY_MAP
# - 100% of tables in the main schema, e.g. n3c, should be listed somewhere in
# STANDALONE_TABLES: Not derived from any other table, nor used to derive any other table/view. Used for QC testing.
@@ -147,6 +133,7 @@ def get_pg_connect_url(local=False):
# - views
# 'csets_to_ignore': ['all_csets'],
'cset_members_items_plus': ['cset_members_items', 'concept'],
+    'all_csets_view': ['all_csets'],

# Unfinished / unsure
# - unsure what to do with these. they probably aren't derived either
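The removed CORE_CSET_DEPENDENT_TABLES list was redundant with DERIVED_TABLE_DEPENDENCY_MAP, as the deleted todo itself suspected: direct dependents are recoverable from the map by reverse lookup. A minimal sketch, assuming only the map shape visible in this diff (derived table mapped to its source tables); `direct_dependents` is a hypothetical helper, not part of the codebase:

from typing import Dict, List

# Shape taken from the two entries visible in this diff (derived -> sources).
DEP_MAP_EXAMPLE: Dict[str, List[str]] = {
    'cset_members_items_plus': ['cset_members_items', 'concept'],
    'all_csets_view': ['all_csets'],
}

def direct_dependents(table: str, dep_map: Dict[str, List[str]]) -> List[str]:
    """Tables/views directly derived from `table`, via reverse lookup on the map."""
    return [derived for derived, sources in dep_map.items() if table in sources]

print(direct_dependents('all_csets', DEP_MAP_EXAMPLE))  # -> ['all_csets_view']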
23 changes: 1 addition & 22 deletions backend/db/ddl-11-all_csets.jinja.sql
@@ -81,25 +81,4 @@ CREATE INDEX ac_idx1{{optional_index_suffix}} ON {{schema}}all_csets{{optional_s

CREATE INDEX ac_idx2{{optional_index_suffix}} ON {{schema}}all_csets{{optional_suffix}}(concept_set_name);

-DROP TABLE {{schema}}cset_term_usage_rec_counts;
-
-
-CREATE OR REPLACE VIEW {{schema}}all_csets_view{{optional_suffix}} AS (
-SELECT
-    codeset_id,
-    project,
-    alias,
-    is_most_recent_version AS mrv,
-    version AS v,
-    is_draft AS draft,
-    archived AS arch,
-    codeset_created_at::date AS ver_create,
-    container_created_at::date AS cont_create,
-    omop_vocab_version AS omop_voc,
-    distinct_person_cnt AS perscnt,
-    total_cnt AS totcnt,
-    flag_cnts,
-    concepts,
-    container_creator,
-    codeset_creator
-FROM {{schema}}all_csets{{optional_suffix}});
+DROP TABLE {{schema}}cset_term_usage_rec_counts;
20 changes: 20 additions & 0 deletions backend/db/ddl-12-all_csets_view.jinja.sql
@@ -0,0 +1,20 @@
+-- View: all_csets_view ----------------------------------------------------------------------------------------------------
+CREATE OR REPLACE VIEW {{schema}}all_csets_view{{optional_suffix}} AS (
+SELECT
+    codeset_id,
+    project,
+    alias,
+    is_most_recent_version AS mrv,
+    version AS v,
+    is_draft AS draft,
+    archived AS arch,
+    codeset_created_at::date AS ver_create,
+    container_created_at::date AS cont_create,
+    omop_vocab_version AS omop_voc,
+    distinct_person_cnt AS perscnt,
+    total_cnt AS totcnt,
+    flag_cnts,
+    concepts,
+    container_creator,
+    codeset_creator
+FROM {{schema}}all_csets);
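For a quick sanity check of the new view, something like the following could work. This is a hypothetical snippet, not part of the change: the schema name 'n3c' follows the config comment above, and get_db_connection(schema=...) / run_sql(con, statement) are the helper signatures used elsewhere in this commit.

from backend.db.utils import get_db_connection, run_sql

# Hypothetical smoke test: the view exposes the shortened aliases (mrv, v, draft, arch, ...).
con = get_db_connection(schema='n3c')
run_sql(con, 'SELECT codeset_id, alias, v, mrv, draft FROM n3c.all_csets_view LIMIT 5;')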
@@ -5,7 +5,7 @@ SELECT csmi.*
, c.concept_name
, c.concept_class_id
FROM {{schema}}cset_members_items csmi
-JOIN concept c ON csmi.concept_id = c.concept_id);
+JOIN {{schema}}concept c ON csmi.concept_id = c.concept_id);
-- CREATE INDEX csmip_idx1{{optional_index_suffix}} ON {{schema}}cset_members_items_plus{{optional_suffix}}(codeset_id);
-- CREATE INDEX csmip_idx2{{optional_index_suffix}} ON {{schema}}cset_members_items_plus{{optional_suffix}}(concept_id);
-- CREATE INDEX csmip_idx3{{optional_index_suffix}} ON {{schema}}cset_members_items_plus{{optional_suffix}}(codeset_id, concept_id);
File renamed without changes.
File renamed without changes.
55 changes: 39 additions & 16 deletions backend/db/utils.py
@@ -35,8 +35,7 @@
DB_DIR = os.path.dirname(os.path.realpath(__file__))
PROJECT_ROOT = Path(DB_DIR).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
-from backend.db.config import CORE_CSET_DEPENDENT_TABLES, CORE_CSET_TABLES, PG_DATATYPES_BY_GROUP, \
-    RECURSIVE_DEPENDENT_TABLE_MAP, \
+from backend.db.config import CORE_CSET_TABLES, PG_DATATYPES_BY_GROUP, RECURSIVE_DEPENDENT_TABLE_MAP, \
REFRESH_JOB_MAX_HRS, get_pg_connect_url
from backend.config import CONFIG, DATASETS_PATH, OBJECTS_PATH
from backend.utils import commify
@@ -50,6 +49,12 @@
SCHEMA = CONFIG["schema"]


+def dedupe_dicts(list_of_dicts: List[Dict]) -> List[Dict]:
+    """Dedupe list of dictionaries"""
+    # noinspection PyTypeChecker
+    return list(map(dict, set(tuple(sorted(d.items())) for d in list_of_dicts)))
+
+
def extract_keys_from_nested_dict(d: Dict[str, Dict]) -> List[str]:
"""Extract keys from a nested dictionary.
@@ -67,24 +72,29 @@ def _extract_keys(d2):
return ordered_unique_keys


-def get_dependent_tables_queue(independent_tables: List[str]) -> List[str]:
+def get_dependent_tables_queue(independent_tables: Union[List[str], str], _filter: str = None) -> List[str]:
"""From independent_tables, get a list of all tables that depend on those tables.
:param independent_tables: The tables from which you are trying to get an ordered list of dependent tables from.
This can be used in a situation such as: You are updating 1+ database tables 'root_tables', and now want to find
which derived tables need to be updated in the correct order.
+    :param _filter: One of 'tables' or 'views'.
:return: A list in the correct order such that for every entry in the list, any tables that depend on that entry
will appear further down in the list.
todo: Replace heuristic w/ a correct algorithm.
I originally had no steps 2&3, and only 1&4 combined. But the result was out of order. This algorithm below is
based on a quick (but messy/long) heuristic. Basically, the longer dependency trees go first. This corrected the
-    problem that I had. But this is just a heuristic. I'm feel confident that there is some correct algorithm for this
-    solvable in polynomial time. When this is done, probably should delete CORE_CSET_DEPENDENT_TABLES & its usages."""
+    problem that I had. But this is just a heuristic. I feel confident that there is some correct algorithm for this
+    solvable in polynomial time.
+    """
+    if _filter not in [None, 'tables', 'views']:
+        raise ValueError(f'Invalid _filter value: {_filter}. Must be one of "tables" or "views".')
final_queue: List[str] = []
table_queues1: List[List[str]] = []
table_queues2: List[List[str]] = []
queues_by_len: Dict[int, List[List[str]]] = {}
+    independent_tables: List[str] = [independent_tables] if isinstance(independent_tables, str) else independent_tables

# 1 & 4: Get a queue of dependent tables
# 1. Build up a list of queues; queue is dependent tables
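The docstring's todo asks for a correct polynomial-time replacement for the heuristic, and a standard topological sort fits. A minimal sketch using Kahn's algorithm over a map shaped like DERIVED_TABLE_DEPENDENCY_MAP (derived table mapped to its source tables); this illustrates the suggested approach and is not code from the commit:

from collections import deque
from typing import Dict, List

def dependents_in_order(roots: List[str], dep_map: Dict[str, List[str]]) -> List[str]:
    """All tables/views transitively derived from `roots`, ordered so each
    entry appears after everything it depends on (Kahn's algorithm)."""
    # Reverse map: source -> tables/views built directly from it.
    dependents: Dict[str, List[str]] = {}
    for derived, sources in dep_map.items():
        for src in sources:
            dependents.setdefault(src, []).append(derived)
    # Collect everything transitively derived from the roots.
    nodes, stack = set(), list(roots)
    while stack:
        for d in dependents.get(stack.pop(), []):
            if d not in nodes:
                nodes.add(d)
                stack.append(d)
    # In-degree counts only sources that must themselves be rebuilt first.
    indegree = {n: sum(s in nodes for s in dep_map[n]) for n in nodes}
    queue = deque(n for n in nodes if indegree[n] == 0)
    order: List[str] = []
    while queue:
        n = queue.popleft()
        order.append(n)
        for d in dependents.get(n, []):
            if d in nodes:
                indegree[d] -= 1
                if indegree[d] == 0:
                    queue.append(d)
    return order  # len(order) < len(nodes) would indicate a dependency cycle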
@@ -119,6 +129,14 @@ def get_dependent_tables_queue(independent_tables: List[str]) -> List[str]:
if i not in final_queue:
final_queue.append(i)

+    # 5. Optional: Filtering
+    if _filter:
+        views: List[str] = [x for x in list_views() if x in final_queue]
+        if _filter == 'views':
+            return views
+        elif _filter == 'tables':
+            return [x for x in final_queue if x not in views]
+
return final_queue


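With the widened signature, a single table name can now be passed and the result filtered. A hypothetical call, using a table name from config.py above; given the map entries shown in this commit, the result would include 'all_csets_view':

# All views, in dependency order, that need rebuilding after 'all_csets' changes.
views = get_dependent_tables_queue('all_csets', _filter='views')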
@@ -135,7 +153,7 @@ def refresh_any_dependent_tables(con: Connection, independent_tables: List[str]


def refresh_derived_tables_exec(
-    con: Connection, derived_tables_queue: List[str] = CORE_CSET_DEPENDENT_TABLES, schema=SCHEMA
+    con: Connection, derived_tables_queue: List[str], schema=SCHEMA
):
"""Refresh TermHub core cset derived tables
@@ -153,7 +171,7 @@
for module in ddl_modules_queue:
t0_2 = datetime.now()
print(f' - creating new table/view: {module}...')
-        statements: List[str] = get_ddl_statements(schema, [module], temp_table_suffix, 'flat')
+        statements: List[str] = get_ddl_statements(schema, module, temp_table_suffix, 'flat')
for statement in statements:
try:
run_sql(con, statement)
@@ -214,10 +232,6 @@ def refresh_derived_tables(
else:
try:
update_db_status_var('last_derived_refresh_request', current_datetime(), local)
-            # The following two calls yield equivalent results as of 2023/08/08. I've commented out
-            # refresh_derived_tables() in case anything goes wrong with refresh_any_dependent_tables(), since that
-            # is based on a heuristic currently, and if anything goes wrong, we may want to switch back. -joeflack4
-            # refresh_derived_tables_exec(con, CORE_CSET_DEPENDENT_TABLES, schema)
refresh_any_dependent_tables(con, independent_tables, schema)
finally:
update_db_status_var('last_derived_refresh_exited', current_datetime(), local)
@@ -654,7 +668,7 @@ def insert_from_dicts(con: Connection, table: str, rows: List[Dict], skip_if_alr
if skip_if_already_exists:
if pk and isinstance(pk, str): # normal, single primary key
already_in_db: List[Dict] = get_objs_by_id(con, table, pk, [row[pk] for row in rows])
-            already_in_db_ids = [row[pk] for row in already_in_db]
+            already_in_db_ids = set([row[pk] for row in already_in_db])
rows = [row for row in rows if row[pk] not in already_in_db_ids]
elif pk and isinstance(pk, list): # composite key
already_in_db: List[Dict] = get_objs_by_composite_key(con, table, pk, rows)
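The list-to-set change above matters at scale: `in` on a list is a linear scan per incoming row, while set membership is a constant-time hash lookup. A self-contained sketch of the difference, with illustrative sizes not taken from the codebase:

existing = [{'id': i} for i in range(100_000)]
incoming = [{'id': i} for i in range(95_000, 105_000)]
ids_list = [r['id'] for r in existing]  # 'in' here scans up to 100k items per row
ids_set = {r['id'] for r in existing}   # 'in' here is an O(1) hash lookup
new_rows = [r for r in incoming if r['id'] not in ids_set]
assert len(new_rows) == 5_000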
@@ -850,7 +864,7 @@ def list_tables(con: Connection = None, schema: str = None, filter_temp_refresh_


def get_ddl_statements(
-    schema: str = SCHEMA, modules: List[str] = None, table_suffix='',return_type=['flat', 'nested'][1],
+    schema: str = SCHEMA, modules: Union[List[str], str] = None, table_suffix='', return_type=['flat', 'nested'][1],
unique_index_names=True,
) -> Union[List[str], Dict[str, List[str]]]:
"""From local SQL DDL Jinja2 templates, pa rse and get a list of SQL statements to run.
@@ -866,7 +880,9 @@
2. Add alters to fix data types (although, should really move this stuff to dtypes settings when creating
dataframe that loads data into db.
3. I think it's inserting a second ; at the end of the last statement of a given module
-    4. consider throwing an error if no statements found, either here, or where func is called"""
+    4. consider throwing an error if no statements found, either here, or where func is called
+    """
+    modules: List[str] = [modules] if isinstance(modules, str) else modules
index_suffix: str = '' if not unique_index_names else '_' + str(randint(10000000, 99999999))
paths: List[str] = glob(DDL_JINJA_PATH_PATTERN)
if modules:
@@ -919,11 +935,19 @@ def reset_temp_refresh_tables(schema: str = SCHEMA, local=False, verbose=True):
# Get all tables/views
items: List[str] = func(con)
# _old tables/views
+    # - For each, drop the current one and replace w/ the backed up one. Drop and remake any dependent views.
backed_up_items = [t for t in items if t.endswith('_old')]
for item in backed_up_items:
item = item.replace('_old', '')
+        dependent_views: List[str] = get_dependent_tables_queue(item, 'views')
+        for view in dependent_views:
+            run_sql(con, f'DROP VIEW IF EXISTS {schema}.{view};')  # alternative: drop table w/ CASCADE
run_sql(con, f'DROP {item_type} IF EXISTS {schema}.{item};')
run_sql(con, f'ALTER {item_type} {schema}.{item}_old RENAME TO {item};')
+        for view in dependent_views:
+            statements: List[str] = get_ddl_statements(schema, view, return_type='flat')
+            for statement in statements:
+                run_sql(con, statement)
# _new tables/views
dangling_new_items = [t for t in items if t.endswith('_new')]
for item in dangling_new_items:
@@ -982,5 +1006,4 @@ def cli():


if __name__ == '__main__':
-    # cli()
-    get_db_connection(schema='xyz')
+    cli()