Skip to content

Commit

Permalink
Add unified_export_json
Browse files Browse the repository at this point in the history
  • Loading branch information
EhsanBitaraf committed Dec 31, 2023
1 parent 3e5f936 commit 4a50b61
Show file tree
Hide file tree
Showing 9 changed files with 328 additions and 171 deletions.
3 changes: 2 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,6 @@
"tests"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
"python.testing.pytestEnabled": true,
"python.REPL.enableREPLSmartSend": false
}
8 changes: 5 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@ All notable changes to this project will be documented in this file.
## v0.0.5 2023-12-28

### Task
- Add `get_article_id_list_by_cstate` to replace `get_article_pmid_list_by_cstate`
- Add `get_article_by_id` to replace `get_article_by_pmid`
- Add `get_all_article_id_list` to replace `get_all_article_pmid_list`

- `move_state_forward` may raise an error with TinyDB
- Check all TinyDB usages

### Improvements
- Add unified_export_json
- Add `get_article_id_list_by_cstate` to replace `get_article_pmid_list_by_cstate`
- Add `get_article_by_id` to replace `get_article_by_pmid`
- Add `get_all_article_id_list` to replace `get_all_article_pmid_list`
- Add `print_error` in `utils.general` for unified error printing
- Add `Published`, `ArxivID`, `SourceBank` fields to `Article`

Expand Down
7 changes: 1 addition & 6 deletions triplea/service/repository/export/save_article.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,9 @@
import json

from bson import ObjectId
from triplea.schemas.article import Article
from triplea.utils.general import JSONEncoder


class JSONEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``ObjectId`` values as strings."""

    def default(self, obj):
        """Return ``str(obj)`` for an ``ObjectId``; otherwise defer to the base encoder."""
        return str(obj) if isinstance(obj, ObjectId) else super().default(obj)


def save_article2json(article: Article, output_file: str):
Expand Down
205 changes: 45 additions & 160 deletions triplea/service/repository/export/triplea_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
import sys

import click
from triplea.config.settings import SETTINGS
from triplea.service.click_logger import logger
from triplea.schemas.article import Article
from triplea.service.repository.export.unified_export_json import json_converter_01
import triplea.service.repository.persist as persist
import traceback

from triplea.utils.general import safe_csv
from triplea.utils.general import print_error, safe_csv


def export_triplea_json(proccess_bar=False, limit_sample=0) -> str:
Expand Down Expand Up @@ -196,8 +198,9 @@ def export_triplea_csvs_in_relational_mode_save_file( # noqa: C901

total_article_in_current_state = len(l_id)

bar = click.progressbar(length=len(l_id), show_pos=True, show_percent=True)
max_refresh_point = 500
if proccess_bar:
bar = click.progressbar(length=len(l_id), show_pos=True, show_percent=True)

refresh_point = 0
csv = ""
authors_csv = (
Expand Down Expand Up @@ -252,7 +255,7 @@ def export_triplea_csvs_in_relational_mode_save_file( # noqa: C901
try:
n = n + 1

if refresh_point == max_refresh_point:
if refresh_point == SETTINGS.AAA_CLI_ALERT_POINT:
refresh_point = 0
if proccess_bar:
print()
Expand All @@ -262,7 +265,7 @@ def export_triplea_csvs_in_relational_mode_save_file( # noqa: C901
)
if proccess_bar is False:
bar.label = f"There are {str(total_article_in_current_state - n)} article(s) left " # noqa: E501
bar.update(max_refresh_point)
bar.update(SETTINGS.AAA_CLI_ALERT_POINT)
else:
refresh_point = refresh_point + 1

Expand All @@ -277,154 +280,40 @@ def export_triplea_csvs_in_relational_mode_save_file( # noqa: C901
updated_article = Article(**a.copy())
except Exception:
print()
print(logger.ERROR(f"Error in parsing article. PMID = {id}"))
print(logger.ERROR(f"Error in parsing article. ID = {id}"))
raise Exception("Article Not Parsed.")

title = ""
year = ""
publisher = ""
journal_issn = ""
journal_iso_abbreviation = ""
language = ""
publication_type = ""

if updated_article.Title is not None:
title = safe_csv(updated_article.Title)

try:
year = updated_article.OreginalArticle["PubmedArticleSet"][
"PubmedArticle"
]["MedlineCitation"]["Article"]["Journal"]["JournalIssue"]["PubDate"][
"Year"
]
except Exception:
try:
year = updated_article.OreginalArticle["PubmedArticleSet"][
"PubmedArticle"
]["MedlineCitation"]["Article"]["Journal"]["JournalIssue"][
"PubDate"
][
"MedlineDate"
]
except Exception:
year = "0"
# with open("sample.json", "w") as outfile:
# json.dump(updated_article.OreginalArticle, outfile)

publisher = safe_csv(updated_article.Journal)
try:
journal_issn = updated_article.OreginalArticle["PubmedArticleSet"][
"PubmedArticle"
]["MedlineCitation"]["Article"]["Journal"]["ISSN"]["#text"]
except Exception:
journal_issn = ""

journal_iso_abbreviation = updated_article.OreginalArticle[
"PubmedArticleSet"
]["PubmedArticle"]["MedlineCitation"]["Article"]["Journal"][
"ISOAbbreviation"
]
lang = updated_article.OreginalArticle["PubmedArticleSet"]["PubmedArticle"][
"MedlineCitation"
]["Article"]["Language"]
if isinstance(lang, list):
for lg in lang:
language = lg + ", " + language
language = language[:-1]
else:
language = lang
language = safe_csv(language)

p = updated_article.OreginalArticle["PubmedArticleSet"]["PubmedArticle"][
"MedlineCitation"
]["Article"]["PublicationTypeList"]["PublicationType"]
if isinstance(p, list):
for i in p:
chunk = i["#text"]
publication_type = chunk + ", " + publication_type
# publication_type = p[0]['#text']
publication_type = publication_type[:-1]
else:
publication_type = updated_article.OreginalArticle["PubmedArticleSet"][
"PubmedArticle"
]["MedlineCitation"]["Article"]["PublicationTypeList"][
"PublicationType"
][
"#text"
]

journal_iso_abbreviation = safe_csv(journal_iso_abbreviation)

publication_type = safe_csv(publication_type)

url = f"https://pubmed.ncbi.nlm.nih.gov/{updated_article.PMID}/"

if updated_article.Abstract is None:
abstract = ""
else:
if updated_article.Abstract.__contains__(","):
abstract = updated_article.Abstract.replace('"', " ")
abstract = f'"{abstract}"'
else:
abstract = updated_article.Abstract
doi = updated_article.DOI
pmid = updated_article.PMID
state = updated_article.State

citation = 0
if updated_article.CitedBy is not None:
citation = len(updated_article.CitedBy)

if updated_article.Authors is not None:
# -------------------------------------------------Parsing--------------------------------------------------
article= json_converter_01(updated_article)
title = safe_csv(article['title'])
year = article['year']
publisher = safe_csv(article['publisher'])
journal_issn = article['journal_issn']
journal_iso_abbreviation = safe_csv(article['journal_iso_abbreviation'])
language = safe_csv(article['language'])
publication_type = safe_csv(article['publication_type'])
url = article['url']
abstract= safe_csv(article['abstract'])
doi = article['doi']
pmid = article['pmid']
state = article['state']
citation = article['citation_count']


if article['authors'] is not None:
for au in updated_article.Authors:
if au.Affiliations is not None:
first_aff = au.Affiliations[0]
department = ""
hospital = ""
institute = ""
country = ""
university = ""
center = ""

location = ""
email = ""
zipcode = ""

if first_aff.Structural is not None:
for s in first_aff.Structural:
if "department" in s:
department = s["department"]
elif "hospital" in s:
hospital = s["hospital"]
elif "institute" in s:
institute = s["institute"]
elif (
"institution" in s
): # aff.ParseMethod = AffiliationParseMethod.TITIPATA_API # noqa: E501
institute = s["institution"]
elif "country" in s:
country = s["country"]
elif "university" in s:
university = s["university"]
elif "center" in s:
center = s["center"]

elif (
"location" in s
): # aff.ParseMethod = AffiliationParseMethod.TITIPATA_API # noqa: E501
location = s["location"]
elif (
"email" in s
): # aff.ParseMethod = AffiliationParseMethod.TITIPATA_API # noqa: E501
email = s["email"]
elif (
"zipcode" in s
): # aff.ParseMethod = AffiliationParseMethod.TITIPATA_API # noqa: E501
zipcode = s["zipcode"]

else:
print(s)
aff = first_aff.Text
if 'affiliations' in au:
first_aff = au['affiliations'][0]
department = first_aff['department']
hospital = first_aff['hospital']
institute = first_aff['institute']
country = first_aff['country']
university = first_aff['university']
center = first_aff['center']
location = first_aff['location']
email = first_aff['email']
zipcode = first_aff['zipcode']
aff = first_aff['text']
else:
aff = None

Expand All @@ -435,13 +324,13 @@ def export_triplea_csvs_in_relational_mode_save_file( # noqa: C901
+ "\n"
)

if updated_article.Keywords is not None:
for k in updated_article.Keywords:
if 'keywords' in article:
for k in article['keywords']:
if k is not None:
keywords_csv = keywords_csv + f"{n},{safe_csv(k.Text)}" + "\n" # noqa: E501

if updated_article.Topics is not None:
for topic in updated_article.Topics:
if 'topics' in article:
for topic in article['topics']:
if topic is not None:
topics_csv = (
topics_csv
Expand All @@ -456,7 +345,7 @@ def export_triplea_csvs_in_relational_mode_save_file( # noqa: C901
)

if proccess_bar:
bar.label = "Article " + updated_article.PMID + " , exported."
bar.label = "Article " + id + " , exported."
bar.update(1)

# ------------------Write to file ---------------------------------
Expand All @@ -473,13 +362,9 @@ def export_triplea_csvs_in_relational_mode_save_file( # noqa: C901
topics_csv = ""

except Exception:
exc_type, exc_value, exc_tb = sys.exc_info()
print()
print(f"line : {exc_tb.tb_lineno}")
print(f"PMID : {updated_article.PMID}")
logger.ERROR(f"Error {exc_type}")
logger.ERROR(f"Error {exc_value}")
traceback.print_tb(exc_tb)
print_error()

f_main.close()
f_authors.close()
Expand Down
24 changes: 24 additions & 0 deletions triplea/service/repository/export/unified_export_json/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@




from triplea.schemas.article import Article, SourceBankType
from triplea.service.repository.export.unified_export_json.arxiv import (
_json_converter_01_arxiv)
from triplea.service.repository.export.unified_export_json.pubmed import (
_json_converter_01_pubmed)


def json_converter_01(article: Article):
    """Convert an article to the unified export JSON structure (converter 01).

    Dispatches on ``article.SourceBank`` to the source-specific converter.
    An article whose ``SourceBank`` is ``None`` is handled as a PubMed
    article.

    Args:
        article: The article to convert.

    Returns:
        Whatever the selected source-specific converter returns.

    Raises:
        NotImplementedError: If ``article.SourceBank`` has no converter
            wired up here (the message names the offending value). May also
            propagate from the selected converter.
    """
    source_bank = article.SourceBank

    # A missing SourceBank is treated as PubMed.
    if source_bank is None or source_bank == SourceBankType.PUBMED:
        return _json_converter_01_pubmed(article)

    if source_bank == SourceBankType.ARXIV:
        return _json_converter_01_arxiv(article)

    raise NotImplementedError(
        f"json_converter_01 does not support SourceBank {source_bank!r}"
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from triplea.schemas.article import Article


def _json_converter_01_arxiv(article: Article):
    """Convert an arXiv article to the unified export JSON structure.

    Placeholder: the arXiv conversion is not implemented, so every call
    raises.

    Args:
        article: The arXiv article to convert (currently unused).

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError(
        "Unified JSON export is not implemented for arXiv articles yet."
    )
Loading

0 comments on commit 4a50b61

Please sign in to comment.