Skip to content

Commit

Permalink
Merge pull request #877 from jhu-bids/bugfixes-db-config-etc
Browse files Browse the repository at this point in the history
Various DB config/refresh bug fixes
  • Loading branch information
joeflack4 committed Sep 12, 2024
2 parents e8aec5f + 7dde021 commit 7e71ee1
Show file tree
Hide file tree
Showing 17 changed files with 136 additions and 135 deletions.
15 changes: 1 addition & 14 deletions backend/db/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,20 +87,6 @@ def get_pg_connect_url(local=False):
}
# Table/View configuration: for n3c schema
CORE_CSET_TABLES = ['code_sets', 'concept_set_container', 'concept_set_version_item', 'concept_set_members']
# todo: add comment for CORE_CSET_DEPENDENT_TABLES. What does this tell us easily that DERIVED_TABLE_DEPENDENCY_MAP does
# not? Is it not computable from DERIVED_TABLE_DEPENDENCY_MAP?
CORE_CSET_DEPENDENT_TABLES = [
# tables
'cset_members_items',
'codeset_ids_by_concept_id',
'concept_ids_by_codeset_id',
'members_items_summary',
'codeset_counts',
'all_csets',
# views
# 'csets_to_ignore',
'cset_members_items_plus',
]
# STANDALONE_TABLES & DERIVED_TABLE_DEPENDENCY_MAP
# - 100% of tables in the main schema, e.g. n3c, should be listed somewhere in
# STANDALONE_TABLES: Not derived from any other table, nor used to derive any other table/view. Used for QC testing.
Expand Down Expand Up @@ -147,6 +133,7 @@ def get_pg_connect_url(local=False):
# - views
# 'csets_to_ignore': ['all_csets'],
'cset_members_items_plus': ['cset_members_items', 'concept'],
'all_csets_view': ['all_csets'],

# Unfinished / unsure
# - unsure what to do with these. they probably aren't derived either
Expand Down
23 changes: 1 addition & 22 deletions backend/db/ddl-11-all_csets.jinja.sql
Original file line number Diff line number Diff line change
Expand Up @@ -81,25 +81,4 @@ CREATE INDEX ac_idx1{{optional_index_suffix}} ON {{schema}}all_csets{{optional_s

CREATE INDEX ac_idx2{{optional_index_suffix}} ON {{schema}}all_csets{{optional_suffix}}(concept_set_name);

DROP TABLE {{schema}}cset_term_usage_rec_counts;


CREATE OR REPLACE VIEW {{schema}}all_csets_view{{optional_suffix}} AS (
SELECT
codeset_id,
project,
alias,
is_most_recent_version AS mrv,
version AS v,
is_draft AS draft,
archived AS arch,
codeset_created_at::date AS ver_create,
container_created_at::date AS cont_create,
omop_vocab_version AS omop_voc,
distinct_person_cnt AS perscnt,
total_cnt AS totcnt,
flag_cnts,
concepts,
container_creator,
codeset_creator
FROM {{schema}}all_csets{{optional_suffix}});
DROP TABLE {{schema}}cset_term_usage_rec_counts;
20 changes: 20 additions & 0 deletions backend/db/ddl-12-all_csets_view.jinja.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
-- View: all_csets_view ----------------------------------------------------------------------------------------------------
CREATE OR REPLACE VIEW {{schema}}all_csets_view{{optional_suffix}} AS (
SELECT
codeset_id,
project,
alias,
is_most_recent_version AS mrv,
version AS v,
is_draft AS draft,
archived AS arch,
codeset_created_at::date AS ver_create,
container_created_at::date AS cont_create,
omop_vocab_version AS omop_voc,
distinct_person_cnt AS perscnt,
total_cnt AS totcnt,
flag_cnts,
concepts,
container_creator,
codeset_creator
FROM {{schema}}all_csets);
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ SELECT csmi.*
, c.concept_name
, c.concept_class_id
FROM {{schema}}cset_members_items csmi
JOIN concept c ON csmi.concept_id = c.concept_id);
JOIN {{schema}}concept c ON csmi.concept_id = c.concept_id);
-- CREATE INDEX csmip_idx1{{optional_index_suffix}} ON {{schema}}cset_members_items_plus{{optional_suffix}}(codeset_id);
-- CREATE INDEX csmip_idx2{{optional_index_suffix}} ON {{schema}}cset_members_items_plus{{optional_suffix}}(concept_id);
-- CREATE INDEX csmip_idx3{{optional_index_suffix}} ON {{schema}}cset_members_items_plus{{optional_suffix}}(codeset_id, concept_id);
Expand Down
File renamed without changes.
File renamed without changes.
21 changes: 11 additions & 10 deletions backend/db/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,7 @@
DB_DIR = os.path.dirname(os.path.realpath(__file__))
PROJECT_ROOT = Path(DB_DIR).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
from backend.db.config import CORE_CSET_DEPENDENT_TABLES, CORE_CSET_TABLES, PG_DATATYPES_BY_GROUP, \
RECURSIVE_DEPENDENT_TABLE_MAP, \
from backend.db.config import CORE_CSET_TABLES, PG_DATATYPES_BY_GROUP, RECURSIVE_DEPENDENT_TABLE_MAP, \
REFRESH_JOB_MAX_HRS, get_pg_connect_url
from backend.config import CONFIG, DATASETS_PATH, OBJECTS_PATH
from backend.utils import commify
Expand All @@ -50,6 +49,12 @@
SCHEMA = CONFIG["schema"]


def dedupe_dicts(list_of_dicts: List[Dict]) -> List[Dict]:
"""Dedupe list of dictionaries"""
# noinspection PyTypeChecker
return list(map(dict, set(tuple(sorted(d.items())) for d in list_of_dicts)))


def extract_keys_from_nested_dict(d: Dict[str, Dict]) -> List[str]:
"""Extract keys from a nested dictionary.
Expand Down Expand Up @@ -80,8 +85,8 @@ def get_dependent_tables_queue(independent_tables: Union[List[str], str], _filte
todo: Replace heuristic w/ a correct algorithm.
I originally had no steps 2&3, and only 1&4 combined. But the result was out of order. This algorithm below is
based on a quick (but messy/long) heuristic. Basically, the longer dependency trees go first. This corrected the
problem that I had. But this is just a heuristic. I'm feel confident that there is some correct algorithm for this
solvable in polynomial time. When this is done, probably should delete CORE_CSET_DEPENDENT_TABLES & its usages.
problem that I had. But this is just a heuristic. I feel confident that there is some correct algorithm for this
solvable in polynomial time.
"""
if _filter not in [None, 'tables', 'views']:
raise ValueError(f'Invalid _filter value: {_filter}. Must be one of "tables" or "views".')
Expand Down Expand Up @@ -148,7 +153,7 @@ def refresh_any_dependent_tables(con: Connection, independent_tables: List[str]


def refresh_derived_tables_exec(
con: Connection, derived_tables_queue: List[str] = CORE_CSET_DEPENDENT_TABLES, schema=SCHEMA
con: Connection, derived_tables_queue: List[str], schema=SCHEMA
):
"""Refresh TermHub core cset derived tables
Expand Down Expand Up @@ -227,10 +232,6 @@ def refresh_derived_tables(
else:
try:
update_db_status_var('last_derived_refresh_request', current_datetime(), local)
# The following two calls yield equivalent results as of 2023/08/08. I've commented out
# refresh_derived_tables() in case anything goes wrong with refresh_any_dependent_tables(), since that
# is based on a heuristic currently, and if anything goes wrong, we may want to switch back. -joeflack4
# refresh_derived_tables_exec(con, CORE_CSET_DEPENDENT_TABLES, schema)
refresh_any_dependent_tables(con, independent_tables, schema)
finally:
update_db_status_var('last_derived_refresh_exited', current_datetime(), local)
Expand Down Expand Up @@ -667,7 +668,7 @@ def insert_from_dicts(con: Connection, table: str, rows: List[Dict], skip_if_alr
if skip_if_already_exists:
if pk and isinstance(pk, str): # normal, single primary key
already_in_db: List[Dict] = get_objs_by_id(con, table, pk, [row[pk] for row in rows])
already_in_db_ids = [row[pk] for row in already_in_db]
already_in_db_ids = set([row[pk] for row in already_in_db])
rows = [row for row in rows if row[pk] not in already_in_db_ids]
elif pk and isinstance(pk, list): # composite key
already_in_db: List[Dict] = get_objs_by_composite_key(con, table, pk, rows)
Expand Down
Loading

0 comments on commit 7e71ee1

Please sign in to comment.