Merge branch 'develop' of github.com:jhu-bids/TermHub into develop
Sigfried committed Sep 12, 2024
2 parents 58a9989 + 7e71ee1 commit de1c4fb
Showing 18 changed files with 165 additions and 142 deletions.
15 changes: 1 addition & 14 deletions backend/db/config.py
@@ -87,20 +87,6 @@ def get_pg_connect_url(local=False):
}
# Table/View configuration: for n3c schema
CORE_CSET_TABLES = ['code_sets', 'concept_set_container', 'concept_set_version_item', 'concept_set_members']
-# todo: add comment for CORE_CSET_DEPENDENT_TABLES. What does this tell us easily that DERIVED_TABLE_DEPENDENCY_MAP does
-# not? Is it not computable from DERIVED_TABLE_DEPENDENCY_MAP?
-CORE_CSET_DEPENDENT_TABLES = [
-    # tables
-    'cset_members_items',
-    'codeset_ids_by_concept_id',
-    'concept_ids_by_codeset_id',
-    'members_items_summary',
-    'codeset_counts',
-    'all_csets',
-    # views
-    # 'csets_to_ignore',
-    'cset_members_items_plus',
-]
# STANDALONE_TABLES & DERIVED_TABLE_DEPENDENCY_MAP
# - 100% of tables in the main schema, e.g. n3c, should be listed somewhere in
# STANDALONE_TABLES: Not derived from any other table, nor used to derive any other table/view. Used for QC testing.
@@ -147,6 +133,7 @@ def get_pg_connect_url(local=False):
# - views
# 'csets_to_ignore': ['all_csets'],
'cset_members_items_plus': ['cset_members_items', 'concept'],
+    'all_csets_view': ['all_csets'],

# Unfinished / unsure
# - unsure what to do with these. they probably aren't derived either
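The removed CORE_CSET_DEPENDENT_TABLES list was redundant with DERIVED_TABLE_DEPENDENCY_MAP, as the deleted todo itself suspected: direct dependents are recoverable from the map by reverse lookup. A minimal sketch, assuming only the map shape visible in this diff (derived table mapped to its source tables); `direct_dependents` is a hypothetical helper, not part of the codebase:

from typing import Dict, List

# Shape taken from the two entries visible in this diff (derived -> sources).
DEP_MAP_EXAMPLE: Dict[str, List[str]] = {
    'cset_members_items_plus': ['cset_members_items', 'concept'],
    'all_csets_view': ['all_csets'],
}

def direct_dependents(table: str, dep_map: Dict[str, List[str]]) -> List[str]:
    """Tables/views directly derived from `table`, via reverse lookup on the map."""
    return [derived for derived, sources in dep_map.items() if table in sources]

print(direct_dependents('all_csets', DEP_MAP_EXAMPLE))  # -> ['all_csets_view']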
23 changes: 1 addition & 22 deletions backend/db/ddl-11-all_csets.jinja.sql
@@ -81,25 +81,4 @@ CREATE INDEX ac_idx1{{optional_index_suffix}} ON {{schema}}all_csets{{optional_s

CREATE INDEX ac_idx2{{optional_index_suffix}} ON {{schema}}all_csets{{optional_suffix}}(concept_set_name);

-DROP TABLE {{schema}}cset_term_usage_rec_counts;
-
-
-CREATE OR REPLACE VIEW {{schema}}all_csets_view{{optional_suffix}} AS (
-SELECT
-    codeset_id,
-    project,
-    alias,
-    is_most_recent_version AS mrv,
-    version AS v,
-    is_draft AS draft,
-    archived AS arch,
-    codeset_created_at::date AS ver_create,
-    container_created_at::date AS cont_create,
-    omop_vocab_version AS omop_voc,
-    distinct_person_cnt AS perscnt,
-    total_cnt AS totcnt,
-    flag_cnts,
-    concepts,
-    container_creator,
-    codeset_creator
-FROM {{schema}}all_csets{{optional_suffix}});
+DROP TABLE {{schema}}cset_term_usage_rec_counts;
20 changes: 20 additions & 0 deletions backend/db/ddl-12-all_csets_view.jinja.sql
@@ -0,0 +1,20 @@
+-- View: all_csets_view ----------------------------------------------------------------------------------------------------
+CREATE OR REPLACE VIEW {{schema}}all_csets_view{{optional_suffix}} AS (
+SELECT
+    codeset_id,
+    project,
+    alias,
+    is_most_recent_version AS mrv,
+    version AS v,
+    is_draft AS draft,
+    archived AS arch,
+    codeset_created_at::date AS ver_create,
+    container_created_at::date AS cont_create,
+    omop_vocab_version AS omop_voc,
+    distinct_person_cnt AS perscnt,
+    total_cnt AS totcnt,
+    flag_cnts,
+    concepts,
+    container_creator,
+    codeset_creator
+FROM {{schema}}all_csets);
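For a quick sanity check of the new view, something like the following could work. This is a hypothetical snippet, not part of the change: the schema name 'n3c' follows the config comment above, and get_db_connection(schema=...) / run_sql(con, statement) are the helper signatures used elsewhere in this commit.

from backend.db.utils import get_db_connection, run_sql

# Hypothetical smoke test: the view exposes the shortened aliases (mrv, v, draft, arch, ...).
con = get_db_connection(schema='n3c')
run_sql(con, 'SELECT codeset_id, alias, v, mrv, draft FROM n3c.all_csets_view LIMIT 5;')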
@@ -5,7 +5,7 @@ SELECT csmi.*
, c.concept_name
, c.concept_class_id
FROM {{schema}}cset_members_items csmi
-JOIN concept c ON csmi.concept_id = c.concept_id);
+JOIN {{schema}}concept c ON csmi.concept_id = c.concept_id);
-- CREATE INDEX csmip_idx1{{optional_index_suffix}} ON {{schema}}cset_members_items_plus{{optional_suffix}}(codeset_id);
-- CREATE INDEX csmip_idx2{{optional_index_suffix}} ON {{schema}}cset_members_items_plus{{optional_suffix}}(concept_id);
-- CREATE INDEX csmip_idx3{{optional_index_suffix}} ON {{schema}}cset_members_items_plus{{optional_suffix}}(codeset_id, concept_id);
File renamed without changes.
File renamed without changes.
55 changes: 39 additions & 16 deletions backend/db/utils.py
@@ -35,8 +35,7 @@
DB_DIR = os.path.dirname(os.path.realpath(__file__))
PROJECT_ROOT = Path(DB_DIR).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
-from backend.db.config import CORE_CSET_DEPENDENT_TABLES, CORE_CSET_TABLES, PG_DATATYPES_BY_GROUP, \
-    RECURSIVE_DEPENDENT_TABLE_MAP, \
+from backend.db.config import CORE_CSET_TABLES, PG_DATATYPES_BY_GROUP, RECURSIVE_DEPENDENT_TABLE_MAP, \
REFRESH_JOB_MAX_HRS, get_pg_connect_url
from backend.config import CONFIG, DATASETS_PATH, OBJECTS_PATH
from backend.utils import commify
@@ -50,6 +49,12 @@
SCHEMA = CONFIG["schema"]


+def dedupe_dicts(list_of_dicts: List[Dict]) -> List[Dict]:
+    """Dedupe list of dictionaries"""
+    # noinspection PyTypeChecker
+    return list(map(dict, set(tuple(sorted(d.items())) for d in list_of_dicts)))
+
+
def extract_keys_from_nested_dict(d: Dict[str, Dict]) -> List[str]:
"""Extract keys from a nested dictionary.
@@ -67,24 +72,29 @@ def _extract_keys(d2):
return ordered_unique_keys


-def get_dependent_tables_queue(independent_tables: List[str]) -> List[str]:
+def get_dependent_tables_queue(independent_tables: Union[List[str], str], _filter: str = None) -> List[str]:
"""From independent_tables, get a list of all tables that depend on those tables.
:param independent_tables: The tables from which you are trying to get an ordered list of dependent tables from.
This can be used in a situation such as: You are updating 1+ database tables 'root_tables', and now want to find
which derived tables need to be updated in the correct order.
+    :param _filter: One of 'tables' or 'views'.
:return: A list in the correct order such that for every entry in the list, any tables that depend on that entry
will appear further down in the list.
todo: Replace heuristic w/ a correct algorithm.
I originally had no steps 2&3, and only 1&4 combined. But the result was out of order. This algorithm below is
based on a quick (but messy/long) heuristic. Basically, the longer dependency trees go first. This corrected the
-    problem that I had. But this is just a heuristic. I'm feel confident that there is some correct algorithm for this
-    solvable in polynomial time. When this is done, probably should delete CORE_CSET_DEPENDENT_TABLES & its usages."""
+    problem that I had. But this is just a heuristic. I feel confident that there is some correct algorithm for this
+    solvable in polynomial time.
+    """
+    if _filter not in [None, 'tables', 'views']:
+        raise ValueError(f'Invalid _filter value: {_filter}. Must be one of "tables" or "views".')
final_queue: List[str] = []
table_queues1: List[List[str]] = []
table_queues2: List[List[str]] = []
queues_by_len: Dict[int, List[List[str]]] = {}
+    independent_tables: List[str] = [independent_tables] if isinstance(independent_tables, str) else independent_tables

# 1 & 4: Get a queue of dependent tables
# 1. Build up a list of queues; queue is dependent tables
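The docstring's todo asks for a correct polynomial-time replacement for the heuristic, and a standard topological sort fits. A minimal sketch using Kahn's algorithm over a map shaped like DERIVED_TABLE_DEPENDENCY_MAP (derived table mapped to its source tables); this illustrates the suggested approach and is not code from the commit:

from collections import deque
from typing import Dict, List

def dependents_in_order(roots: List[str], dep_map: Dict[str, List[str]]) -> List[str]:
    """All tables/views transitively derived from `roots`, ordered so each
    entry appears after everything it depends on (Kahn's algorithm)."""
    # Reverse map: source -> tables/views built directly from it.
    dependents: Dict[str, List[str]] = {}
    for derived, sources in dep_map.items():
        for src in sources:
            dependents.setdefault(src, []).append(derived)
    # Collect everything transitively derived from the roots.
    nodes, stack = set(), list(roots)
    while stack:
        for d in dependents.get(stack.pop(), []):
            if d not in nodes:
                nodes.add(d)
                stack.append(d)
    # In-degree counts only sources that must themselves be rebuilt first.
    indegree = {n: sum(s in nodes for s in dep_map[n]) for n in nodes}
    queue = deque(n for n in nodes if indegree[n] == 0)
    order: List[str] = []
    while queue:
        n = queue.popleft()
        order.append(n)
        for d in dependents.get(n, []):
            if d in nodes:
                indegree[d] -= 1
                if indegree[d] == 0:
                    queue.append(d)
    return order  # len(order) < len(nodes) would indicate a dependency cycle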
@@ -119,6 +129,14 @@ def get_dependent_tables_queue(independent_tables: List[str]) -> List[str]:
if i not in final_queue:
final_queue.append(i)

+    # 5. Optional: Filtering
+    if _filter:
+        views: List[str] = [x for x in list_views() if x in final_queue]
+        if _filter == 'views':
+            return views
+        elif _filter == 'tables':
+            return [x for x in final_queue if x not in views]
+
return final_queue


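With the widened signature, a single table name can now be passed and the result filtered. A hypothetical call, using a table name from config.py above; given the map entries shown in this commit, the result would include 'all_csets_view':

# All views, in dependency order, that need rebuilding after 'all_csets' changes.
views = get_dependent_tables_queue('all_csets', _filter='views')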
@@ -135,7 +153,7 @@ def refresh_any_dependent_tables(con: Connection, independent_tables: List[str]


def refresh_derived_tables_exec(
-    con: Connection, derived_tables_queue: List[str] = CORE_CSET_DEPENDENT_TABLES, schema=SCHEMA
+    con: Connection, derived_tables_queue: List[str], schema=SCHEMA
):
"""Refresh TermHub core cset derived tables
@@ -153,7 +171,7 @@
for module in ddl_modules_queue:
t0_2 = datetime.now()
print(f' - creating new table/view: {module}...')
-        statements: List[str] = get_ddl_statements(schema, [module], temp_table_suffix, 'flat')
+        statements: List[str] = get_ddl_statements(schema, module, temp_table_suffix, 'flat')
for statement in statements:
try:
run_sql(con, statement)
@@ -214,10 +232,6 @@ def refresh_derived_tables(
else:
try:
update_db_status_var('last_derived_refresh_request', current_datetime(), local)
-            # The following two calls yield equivalent results as of 2023/08/08. I've commented out
-            # refresh_derived_tables() in case anything goes wrong with refresh_any_dependent_tables(), since that
-            # is based on a heuristic currently, and if anything goes wrong, we may want to switch back. -joeflack4
-            # refresh_derived_tables_exec(con, CORE_CSET_DEPENDENT_TABLES, schema)
refresh_any_dependent_tables(con, independent_tables, schema)
finally:
update_db_status_var('last_derived_refresh_exited', current_datetime(), local)
@@ -654,7 +668,7 @@ def insert_from_dicts(con: Connection, table: str, rows: List[Dict], skip_if_alr
if skip_if_already_exists:
if pk and isinstance(pk, str): # normal, single primary key
already_in_db: List[Dict] = get_objs_by_id(con, table, pk, [row[pk] for row in rows])
-            already_in_db_ids = [row[pk] for row in already_in_db]
+            already_in_db_ids = set([row[pk] for row in already_in_db])
rows = [row for row in rows if row[pk] not in already_in_db_ids]
elif pk and isinstance(pk, list): # composite key
already_in_db: List[Dict] = get_objs_by_composite_key(con, table, pk, rows)
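The list-to-set change above matters at scale: `in` on a list is a linear scan per incoming row, while set membership is a constant-time hash lookup. A self-contained sketch of the difference, with illustrative sizes not taken from the codebase:

existing = [{'id': i} for i in range(100_000)]
incoming = [{'id': i} for i in range(95_000, 105_000)]
ids_list = [r['id'] for r in existing]  # 'in' here scans up to 100k items per row
ids_set = {r['id'] for r in existing}   # 'in' here is an O(1) hash lookup
new_rows = [r for r in incoming if r['id'] not in ids_set]
assert len(new_rows) == 5_000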
@@ -850,7 +864,7 @@ def list_tables(con: Connection = None, schema: str = None, filter_temp_refresh_


def get_ddl_statements(
-    schema: str = SCHEMA, modules: List[str] = None, table_suffix='',return_type=['flat', 'nested'][1],
+    schema: str = SCHEMA, modules: Union[List[str], str] = None, table_suffix='', return_type=['flat', 'nested'][1],
unique_index_names=True,
) -> Union[List[str], Dict[str, List[str]]]:
"""From local SQL DDL Jinja2 templates, pa rse and get a list of SQL statements to run.
@@ -866,7 +880,9 @@
2. Add alters to fix data types (although, should really move this stuff to dtypes settings when creating
dataframe that loads data into db.
3. I think it's inserting a second ; at the end of the last statement of a given module
-    4. consider throwing an error if no statements found, either here, or where func is called"""
+    4. consider throwing an error if no statements found, either here, or where func is called
+    """
+    modules: List[str] = [modules] if isinstance(modules, str) else modules
index_suffix: str = '' if not unique_index_names else '_' + str(randint(10000000, 99999999))
paths: List[str] = glob(DDL_JINJA_PATH_PATTERN)
if modules:
@@ -919,11 +935,19 @@ def reset_temp_refresh_tables(schema: str = SCHEMA, local=False, verbose=True):
# Get all tables/views
items: List[str] = func(con)
# _old tables/views
+    # - For each, drop the current one and replace w/ the backed up one. Drop and remake any dependent views.
backed_up_items = [t for t in items if t.endswith('_old')]
for item in backed_up_items:
item = item.replace('_old', '')
+        dependent_views: List[str] = get_dependent_tables_queue(item, 'views')
+        for view in dependent_views:
+            run_sql(con, f'DROP VIEW IF EXISTS {schema}.{view};')  # alternative: drop table w/ CASCADE
run_sql(con, f'DROP {item_type} IF EXISTS {schema}.{item};')
run_sql(con, f'ALTER {item_type} {schema}.{item}_old RENAME TO {item};')
+        for view in dependent_views:
+            statements: List[str] = get_ddl_statements(schema, view, return_type='flat')
+            for statement in statements:
+                run_sql(con, statement)
# _new tables/views
dangling_new_items = [t for t in items if t.endswith('_new')]
for item in dangling_new_items:
@@ -982,5 +1006,4 @@ def cli():


if __name__ == '__main__':
-    # cli()
-    get_db_connection(schema='xyz')
+    cli()