diff --git a/.github/workflows/python-flake.yml b/.github/workflows/python-flake.yml index 3422188..f03842c 100644 --- a/.github/workflows/python-flake.yml +++ b/.github/workflows/python-flake.yml @@ -29,6 +29,7 @@ jobs: pip install flake8 - name: Lint with flake8 run: | + flake8 ./triplea/ --count --no-show-source --statistics flake8 ./triplea/ --count --exit-zero --max-complexity=20 --max-line-length=90 --no-show-source --statistics - name: Fail if there are linting errors if: ${{ failure() }} diff --git a/CHANGELOG.md b/CHANGELOG.md index b7583ab..6717a79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,17 @@ # Changelog All notable changes to this project will be documented in this file. + +## v0.0.4 - 2023-10-14 +### Improvements +- Add FlagAffiliationMining_Titipata (API-based affiliation parsing) +- Add ParseMethod field to Affiliation + + +### Bug Fixes +- Fix GitHub Action `python-flake` + + ## v0.0.3 - 2023-09-27 ### Improvements - Add micro version diff --git a/README.md b/README.md index a308ddc..1ad979f 100644 --- a/README.md +++ b/README.md @@ -77,6 +77,10 @@ https://badge.fury.io/for/py/Triple-a --> - Dynamic citations deep definition for meta data fetch - Network Analysis (Per Node/Overall Graph) - Import bibliography file +- Use for [Bibliometric Analysis](https://researchguides.uic.edu/bibliometrics) + + + # How to use ## Setup @@ -268,8 +272,8 @@ List of Custom Pipeline |------|--------|-----------| |Triple extraction from article abstract|FlagExtractKG|| |Topic extraction from article abstract|FlagExtractTopic|| -|Convert Affiliation text to structural data|FlagAffiliationMining| | - +|Convert Affiliation text to structural data|FlagAffiliationMining|A simple way to parse affiliation text| +|Convert Affiliation text to structural data|FlagAffiliationMining_Titipata|Uses the [Titipat Achakulvisut repo](https://github.com/titipata/affiliation_parser) to parse affiliation text| #### NER Article Title You can try the NER method to extract the major topic of the article's title by using the following command. This command is independent and is used for testing and is not stored in the Arepo.
diff --git a/docs/code-quality.md b/docs/code-quality.md index d24d9e1..18363a6 100644 --- a/docs/code-quality.md +++ b/docs/code-quality.md @@ -86,7 +86,7 @@ Flake8 Rules https://www.flake8rules.com/ -Sample command: +## Sample command: ``` flake8 --show-source .\triplea\cli\main.py @@ -158,4 +158,200 @@ flake8 --config=.flake8 --output-file out-flake8.txt .\triplea\cli\ ``` black .\triplea\cli\ -``` \ No newline at end of file +``` + + +## History + +### All + +```sh +flake8 --config=.flake8 --count --output-file out-flake8.txt .\triplea\ --no-show-source --statistics +``` + + +2023-10-15 +``` +4 C901 'export_rayyan_csv' is too complex (21) +24 E116 unexpected indentation (comment) +6 E117 over-indented +3 E122 continuation line missing indentation or outdented +3 E125 continuation line with same indent as next logical line +14 E127 continuation line over-indented for visual indent +8 E128 continuation line under-indented for visual indent +1 E131 continuation line unaligned for hanging indent +17 E201 whitespace after '(' +14 E202 whitespace before ')' +38 E203 whitespace before ':' +1 E211 whitespace before '(' +6 E221 multiple spaces before operator +7 E222 multiple spaces after operator +24 E225 missing whitespace around operator +43 E231 missing whitespace after ',' +4 E251 unexpected spaces around keyword / parameter equals +4 E252 missing whitespace around parameter equals +15 E261 at least two spaces before inline comment +5 E262 inline comment should start with '# ' +24 E265 block comment should start with '# ' +1 E271 multiple spaces after keyword +1 E301 expected 1 blank line, found 0 +32 E302 expected 2 blank lines, found 1 +84 E303 too many blank lines (3) +1 E402 module level import not at top of file +143 E501 line too long (112 > 90 characters) +1 E711 comparison to None should be 'if cond is None:' +4 E712 comparison to False should be 'if cond is False:' or 'if not cond:' +11 E722 do not use bare 'except' +1 E741 ambiguous variable name 'l' +78 F401 'json' imported but unused +1 F821 undefined name 'topics' +29 F841 local variable 'output_data' is assigned to but never used +63 W291 trailing whitespace +13 W292 no newline at end of file +84 W293 blank line contains whitespace +11 W391 blank line at end of file +``` + +select +```sh +flake8 --config=.flake8 --count --output-file out-flake8.txt .\triplea\ --no-show-source --statistics --select E501 +``` + +ignore +```sh +flake8 --config=.flake8 --count --output-file out-flake8.txt .\triplea\ --no-show-source --statistics --ignore E501,W503,E722 +``` + + +```sh +black .\triplea\the_private_backyard.py + +black .\triplea\the_private_backyard1.py + +black .\triplea\the_private_backyard2.py + +black .\triplea\the_private_backyard3.py + +black .\triplea\the_private_backyard3.py + +black .\triplea\the_private_backyard_mongodb.py + +``` + +2023-10-16 +``` +4 C901 'export_rayyan_csv' is too complex (21) +1 E303 too many blank lines (3) +82 E501 line too long (110 > 90 characters) +10 E722 do not use bare 'except' +1 E741 ambiguous variable name 'l' +1 F821 undefined name 'topics' +5 F841 local variable 'city' is assigned to but never used +104 +``` +104 + +### cli + +```sh +black .\triplea\cli\ +``` + +```sh +flake8 --config=.flake8 --output-file out-flake8.txt .\triplea\cli\ --no-show-source --statistics +``` + +0 + +### client + +```sh +black .\triplea\client\ +``` + +```sh +flake8 --config=.flake8 --output-file out-flake8.txt .\triplea\client\ --no-show-source --statistics +``` + +0 + +### config + + +```sh +black 
.\triplea\config\ +``` + +```sh +flake8 --config=.flake8 --output-file out-flake8.txt .\triplea\config\ --no-show-source --statistics +``` + +0 + +### db + +```sh +black .\triplea\db\ +``` + +```sh +flake8 --config=.flake8 --output-file out-flake8.txt .\triplea\db\ --no-show-source --statistics +``` +0 + + +### schemas + + +```sh +black .\triplea\schemas\ +``` + +```sh +flake8 --config=.flake8 --output-file out-flake8.txt .\triplea\schemas\ --no-show-source --statistics +``` + +0 + +### service + +```sh +black .\triplea\service\ +``` + +```sh +flake8 --config=.flake8 --count --output-file out-flake8.txt .\triplea\service\ --no-show-source --statistics --ignore E501,W503,E722 + +flake8 --config=.flake8 --count --output-file out-flake8.txt .\triplea\service\ --no-show-source --statistics + +``` + +510 +137 +113 + +``` +4 C901 'export_rayyan_csv' is too complex (21) +1 E303 too many blank lines (3) +1 E402 module level import not at top of file +82 E501 line too long (110 > 90 characters) +2 E712 comparison to False should be 'if cond is False:' or 'if not cond:' +10 E722 do not use bare 'except' +1 E741 ambiguous variable name 'l' +5 F401 'networkx.classes.function.is_directed' imported but unused +1 F821 undefined name 'topics' +6 F841 local variable 'elapsed' is assigned to but never used +``` + +### utils + + +```sh +black .\triplea\utils\ +``` + +```sh +flake8 --config=.flake8 --output-file out-flake8.txt .\triplea\utils\ --no-show-source --statistics +``` + +0 \ No newline at end of file diff --git a/docs/related-work.md b/docs/related-work.md index 1b07a77..1bf07c9 100644 --- a/docs/related-work.md +++ b/docs/related-work.md @@ -90,6 +90,26 @@ https://pubmed.ncbi.nlm.nih.gov/30678631/ https://github.com/NeuroMorphoOrg/PaperBot +## Paperfetcher +Paperfetcher: A tool to automate handsearching and citation searching for systematic reviews + +https://onlinelibrary.wiley.com/doi/epdf/10.1002/jrsm.1604 + +https://paperfetcher.github.io/ + +https://github.com/paperfetcher/paperfetcher-web-app + +https://github.com/paperfetcher/paperfetcher + + +## paperscraper +![Awesome Badges](https://img.shields.io/badge/badges-awesome-green.svg) +Tools to scrape publication metadata from pubmed, arxiv, medrxiv and chemrxiv. + +Since v0.2.4 paperscraper also supports scraping PDF files directly! Thanks to @daenuprobst for suggestions! 
+ +https://github.com/PhosphorylatedRabbits/paperscraper + ## bibliometrix: An R-tool for comprehensive science mapping analysis https://www.sciencedirect.com/science/article/abs/pii/S1751157717300500 diff --git a/out-flake8.txt b/out-flake8.txt index 234b097..e69de29 100644 --- a/out-flake8.txt +++ b/out-flake8.txt @@ -1,46 +0,0 @@ -.\triplea\cli\export_graph.py:121: [E127] continuation line over-indented for visual indent -1 E127 continuation line over-indented for visual indent -.\triplea\cli\export_graph.py:121: [E127] continuation line over-indented for visual indent - "graphjson", - ^ -1 E127 continuation line over-indented for visual indent -.\triplea\cli\export_graph.py:121: [E128] continuation line under-indented for visual indent - "graphjson", - ^ -.\triplea\cli\export_graph.py:122: [E128] continuation line under-indented for visual indent - "gson", - ^ -.\triplea\cli\export_graph.py:123: [E128] continuation line under-indented for visual indent - "gpickle", - ^ -.\triplea\cli\export_graph.py:124: [E128] continuation line under-indented for visual indent - "graphml", - ^ -.\triplea\cli\export_graph.py:125: [E128] continuation line under-indented for visual indent - "gexf"]), - ^ -5 E128 continuation line under-indented for visual indent -.\triplea\cli\export_graph.py:121: [E127] continuation line over-indented for visual indent - "graphjson", - ^ -1 E127 continuation line over-indented for visual indent -.\triplea\cli\export_graph.py:121: [E128] continuation line under-indented for visual indent - "graphjson", - ^ -.\triplea\cli\export_graph.py:122: [E128] continuation line under-indented for visual indent - "gson", - ^ -.\triplea\cli\export_graph.py:123: [E128] continuation line under-indented for visual indent - "gpickle", - ^ -.\triplea\cli\export_graph.py:124: [E128] continuation line under-indented for visual indent - "graphml", - ^ -.\triplea\cli\export_graph.py:125: [E128] continuation line under-indented for visual indent - "gexf"]), - ^ -5 E128 continuation line under-indented for visual indent -.\triplea\cli\export_graph.py:123: [E127] continuation line over-indented for visual indent - "gpickle", - ^ -1 E127 continuation line over-indented for visual indent diff --git a/pyproject.toml b/pyproject.toml index 2500ac2..d9b719f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "triplea" -version = "0.0.3" +version = "0.0.4" license = "Apache-2.0" description = "Article Analysis Assistant" authors = ["Ehsan Bitaraf ", "Maryam Jafarpour "] diff --git a/triplea/cli/aaa.py b/triplea/cli/aaa.py index ded6841..b62236c 100644 --- a/triplea/cli/aaa.py +++ b/triplea/cli/aaa.py @@ -11,7 +11,7 @@ from triplea.cli.export_graph import export_graph # exportgraph # noqa: F401 from triplea.cli.export import export # export # noqa: F401 from triplea.cli.export_pretrain_llm import export_llm # exportllm # noqa: F401 -from triplea.cli.export_article import cli_export_article # export_article # noqa: F401 +from triplea.cli.export_article import cli_export_article # export_article # noqa: F401 from triplea.cli.visualize import visualize # visualize # noqa: F401 from triplea.cli.visualize import visualize_file # visualize_file # noqa: F401 from triplea.cli.analysis import analysis # analysis # noqa: F401 diff --git a/triplea/cli/config.py b/triplea/cli/config.py index 924e78e..21d752c 100644 --- a/triplea/cli/config.py +++ b/triplea/cli/config.py @@ -84,19 +84,27 @@ def configuration(command): f"""AAA_MONGODB_CONNECTION_URL= {AAA_MONGODB_CONNECTION_URL} 
\n""" ) - file.writelines(f"""AAA_MONGODB_DB_NAME= - {AAA_MONGODB_DB_NAME} \n""") + file.writelines( + f"""AAA_MONGODB_DB_NAME= + {AAA_MONGODB_DB_NAME} \n""" + ) elif TRIPLEA_DB_TYPE == "TinyDB": - file.writelines(f"""AAA_TINYDB_FILENAME= - {AAA_TINYDB_FILENAME} \n""") + file.writelines( + f"""AAA_TINYDB_FILENAME= + {AAA_TINYDB_FILENAME} \n""" + ) file.writelines(f"AAA_TPS_LIMIT={AAA_TPS_LIMIT} \n") file.writelines(f"AAA_PROXY_HTTP={AAA_PROXY_HTTP} \n") file.writelines(f"AAA_PROXY_HTTPS={AAA_PROXY_HTTPS} \n") - file.writelines(f"""AAA_REFF_CRAWLER_DEEP= - {AAA_REFF_CRAWLER_DEEP} \n""") - file.writelines(f"""AAA_CITED_CRAWLER_DEEP= - {AAA_CITED_CRAWLER_DEEP} \n""") + file.writelines( + f"""AAA_REFF_CRAWLER_DEEP= + {AAA_REFF_CRAWLER_DEEP} \n""" + ) + file.writelines( + f"""AAA_CITED_CRAWLER_DEEP= + {AAA_CITED_CRAWLER_DEEP} \n""" + ) else: raise NotImplementedError diff --git a/triplea/cli/export.py b/triplea/cli/export.py index 6f9c71a..3027e23 100644 --- a/triplea/cli/export.py +++ b/triplea/cli/export.py @@ -31,7 +31,7 @@ "--format", "-f", "format_type", - type=click.Choice(["csv", "json","csvs"]), + type=click.Choice(["csv", "json", "csvs"]), multiple=False, required=True, help="""Export article repository in specific format. @@ -40,7 +40,7 @@ json : csvs : Several csv files are created - and one-to-many relationships + and one-to-many relationships are maintained in them """, @@ -102,12 +102,11 @@ def export(export_type, format_type, output_file, proccess_bar, limit_sample): with open(output_file, "w", encoding="utf-8") as file1: file1.write(csv) elif format_type == "csvs": - repo_export.export_triplea_csvs_in_relational_mode_save_file(output_file,proccess_bar, limit_sample) + repo_export.export_triplea_csvs_in_relational_mode_save_file( + output_file, proccess_bar, limit_sample + ) elif format_type == "json": - json_str = repo_export.export_triplea_json( - proccess_bar, - limit_sample - ) + json_str = repo_export.export_triplea_json(proccess_bar, limit_sample) with open(output_file, "w", encoding="utf-8") as file1: file1.write(json_str) else: diff --git a/triplea/cli/export_article.py b/triplea/cli/export_article.py index 73c0428..b626341 100644 --- a/triplea/cli/export_article.py +++ b/triplea/cli/export_article.py @@ -1,4 +1,3 @@ -import json import sys import click from triplea.cli.main import cli @@ -7,7 +6,6 @@ from triplea.service.click_logger import logger - @cli.command("export_article", help="Export Article by identifier.") @click.option( "--idtype", @@ -36,7 +34,7 @@ "--format", "-f", "format_type", - type=click.Choice(["xml", "json","csv"]), + type=click.Choice(["xml", "json", "csv"]), multiple=False, required=True, help="""Export article repository in specific format. 
@@ -56,17 +54,14 @@ required=True, help="File name & path of output graph format.", ) -def cli_export_article( - identifier_type, identifier, format_type, output_file -): - if identifier_type == 'pmid': +def cli_export_article(identifier_type, identifier, format_type, output_file): + if identifier_type == "pmid": a = persist.get_article_by_pmid(identifier) if a is None: logger.ERROR("Not found.") sys.exit(1) return - output_data = a a_title = a["Title"] a_journal = a["Journal"] a_doi = a["DOI"] @@ -96,13 +91,12 @@ def cli_export_article( keywords = keywords + k["Text"] + ", " logger.INFO(f"Keywords: {keywords}") - if format_type == 'json': - save_articlestr2json(a,output_file) + if format_type == "json": + save_articlestr2json(a, output_file) - - elif identifier_type == 'pmcid': + elif identifier_type == "pmcid": raise NotImplementedError - elif identifier_type == 'doi': + elif identifier_type == "doi": raise NotImplementedError sys.exit(1) diff --git a/triplea/cli/export_graph.py b/triplea/cli/export_graph.py index ed3fd91..d14cb79 100644 --- a/triplea/cli/export_graph.py +++ b/triplea/cli/export_graph.py @@ -117,12 +117,7 @@ "--format", "-f", "format_type", - type=click.Choice(["graphdict", - "graphjson", - "gson", - "gpickle", - "graphml", - "gexf"]), + type=click.Choice(["graphdict", "graphjson", "gson", "gpickle", "graphml", "gexf"]), multiple=False, required=True, help="""Generate graph and export. @@ -245,8 +240,10 @@ def export_graph( l_edges.extend(graphdict["edges"]) else: - logger.ERROR(f"""Invalid value for - '--generate' / '-g': {generate_type}""") + logger.ERROR( + f"""Invalid value for + '--generate' / '-g': {generate_type}""" + ) sys.exit(1) print() diff --git a/triplea/cli/ner.py b/triplea/cli/ner.py index 7d07486..dd08315 100644 --- a/triplea/cli/ner.py +++ b/triplea/cli/ner.py @@ -1,5 +1,6 @@ from string import printable import click + # from triplea.service.nlp.ner import get_title_ner # Expire Module from triplea.service.click_logger import logger from triplea.cli.main import cli diff --git a/triplea/cli/pipeline.py b/triplea/cli/pipeline.py index 5c327ab..b69e478 100644 --- a/triplea/cli/pipeline.py +++ b/triplea/cli/pipeline.py @@ -16,6 +16,8 @@ def pipeline(name: str): go_extract_topic() elif name == "FlagAffiliationMining": go_affiliation_mining() + elif name == "FlagAffiliationMining_Titipata": + go_affiliation_mining(method="Titipata") else: raise NotImplementedError diff --git a/triplea/cli/visualize.py b/triplea/cli/visualize.py index 40a8d41..694716c 100644 --- a/triplea/cli/visualize.py +++ b/triplea/cli/visualize.py @@ -78,9 +78,7 @@ def visualize(generate_type, port): l_edges.extend(graphdict["edges"]) elif g_type == "article-topic": - graphdict = gextract.graph_extractor( - gextract.graph_extract_article_topic - ) + graphdict = gextract.graph_extractor(gextract.graph_extract_article_topic) l_nodes.extend(graphdict["nodes"]) l_edges.extend(graphdict["edges"]) @@ -92,9 +90,7 @@ def visualize(generate_type, port): l_edges.extend(graphdict["edges"]) elif g_type == "article-keyword": - graphdict = gextract.graph_extractor( - gextract.graph_extract_article_keyword - ) + graphdict = gextract.graph_extractor(gextract.graph_extract_article_keyword) l_nodes.extend(graphdict["nodes"]) l_edges.extend(graphdict["edges"]) @@ -106,21 +102,19 @@ def visualize(generate_type, port): l_edges.extend(graphdict["edges"]) elif g_type == "article-cited": - graphdict = gextract.graph_extractor( - gextract.graph_extract_article_cited - ) + graphdict = 
gextract.graph_extractor(gextract.graph_extract_article_cited) l_nodes.extend(graphdict["nodes"]) l_edges.extend(graphdict["edges"]) elif g_type == "country-authorship": - graphdict = gextract.graph_extractor( - gextract.graph_extract_article_country - ) + graphdict = gextract.graph_extractor(gextract.graph_extract_article_country) l_nodes.extend(graphdict["nodes"]) l_edges.extend(graphdict["edges"]) else: - logger.ERROR(f"""Invalid value for - '--generate' / '-g': {generate_type}""") + logger.ERROR( + f"""Invalid value for + '--generate' / '-g': {generate_type}""" + ) # print() # logger.DEBUG(f'Remove duplication in Nodes & Edges. ') @@ -136,7 +130,6 @@ def visualize(generate_type, port): httpd.serve_forever() - @cli.command("visualize_file", help="Visualize Graph File.") @click.argument( "file", @@ -149,12 +142,7 @@ def visualize(generate_type, port): "--format", "-f", "format_type", - type=click.Choice(["graphdict", - "graphjson", - "gson", - "gpickle", - "graphml", - "gexf"]), + type=click.Choice(["graphdict", "graphjson", "gson", "gpickle", "graphml", "gexf"]), multiple=False, required=True, help="""Generate graph and export. @@ -178,8 +166,7 @@ def visualize(generate_type, port): """, ) @click.option("--port", "-p", "port", default=8000, help="port") -def visualize_file(file,format_type, port): - +def visualize_file(file, format_type, port): if format_type == "graphdict": with open(file, "r") as f: graphdict = json.load(f) @@ -191,21 +178,20 @@ def visualize_file(file,format_type, port): elif format_type == "gpickle": raise NotImplementedError - + elif format_type == "graphml": raise NotImplementedError - + elif format_type == "gexf": raise NotImplementedError - + else: logger.ERROR(f"Invalid value for '--format' / '-f': {format_type}") sys.exit(1) - graphdatarefresh.refresh_interactivegraph(graphdict) graphdatarefresh.refresh_alchemy(graphdict) with socketserver.TCPServer(("", port), Handler) as httpd: logger.INFO(f"serving at http://localhost:{port} ....") - httpd.serve_forever() \ No newline at end of file + httpd.serve_forever() diff --git a/triplea/client/affiliation_parser.py b/triplea/client/affiliation_parser.py new file mode 100644 index 0000000..9fc02a6 --- /dev/null +++ b/triplea/client/affiliation_parser.py @@ -0,0 +1,57 @@ +from triplea.config.settings import SETTINGS +import requests +from triplea.service.click_logger import logger +from urllib.parse import quote + +session = requests.Session() + + +def parse_affiliation(text: str) -> list: + URL = f"{SETTINGS.AAA_TOPIC_EXTRACT_ENDPOINT}/affiliation" + + # # data to be sent to api + # PARAMS = { + # "text": text, + # } + + text_encode = quote(text) + url = f"{URL}/{text_encode}" + + headers = { + "User-Agent": SETTINGS.AAA_CLIENT_AGENT, + } + + # To use HTTP Basic Auth with your proxy, + # use the http://user:password@host.com/ syntax: + if SETTINGS.AAA_PROXY_HTTP is not None: + proxy_servers = { + "http": SETTINGS.AAA_PROXY_HTTP, + "https": SETTINGS.AAA_PROXY_HTTPS, + } + else: + proxy_servers = None + + # sending get request and saving the response as response object + try: + r = session.get(url=url, headers=headers, proxies=proxy_servers) + + except Exception: + raise Exception("Connection Error.") + + # extracting data in json format + try: + if r.status_code == 200: + data = r.json() + if "status" in data: + return data["result"] + else: + logger.ERROR("status not exist.") + raise + else: + logger.ERROR(f"ERROR : {r.status_code}") + logger.ERROR(f"Reason : {r.reason}") + + except Exception as ex: + 
logger.ERROR(f"Error : {ex}") + logger.ERROR(f"{type(r)} {r} ") + raise diff --git a/triplea/client/pubmed/__init__.py b/triplea/client/pubmed/__init__.py index 01fe550..c224231 100644 --- a/triplea/client/pubmed/__init__.py +++ b/triplea/client/pubmed/__init__.py @@ -4,6 +4,7 @@ import json from triplea.service.click_logger import logger + def get_article_list_from_pubmed(retstart: int, retmax: int, search_term: str) -> dict: """ This function takes in a search term, and returns a dictionary of the results of the search @@ -33,9 +34,7 @@ def get_article_list_from_pubmed(retstart: int, retmax: int, search_term: str) - # "maxdate" : "2023/09/17" } - headers = { - "User-Agent": SETTINGS.AAA_CLIENT_AGENT - } + headers = {"User-Agent": SETTINGS.AAA_CLIENT_AGENT} # To use HTTP Basic Auth with your proxy, use the http://user:password@host.com/ syntax: if SETTINGS.AAA_PROXY_HTTP is not None: @@ -76,9 +75,7 @@ def get_article_details_from_pubmed(PMID) -> dict: "retmode": "xml", } - headers = { - "User-Agent": SETTINGS.AAA_CLIENT_AGENT - } + headers = {"User-Agent": SETTINGS.AAA_CLIENT_AGENT} # To use HTTP Basic Auth with your proxy, use the http://user:password@host.com/ syntax: if SETTINGS.AAA_PROXY_HTTP is not None: @@ -107,9 +104,7 @@ def get_cited_article_from_pubmed(PMID) -> dict: "retmode": "json", } - headers = { - "User-Agent": SETTINGS.AAA_CLIENT_AGENT - } + headers = {"User-Agent": SETTINGS.AAA_CLIENT_AGENT} # To use HTTP Basic Auth with your proxy, use the http://user:password@host.com/ syntax: if SETTINGS.AAA_PROXY_HTTP is not None: @@ -134,12 +129,12 @@ def get_cited_article_from_pubmed(PMID) -> dict: for linkdb in link["linksetdbs"]: rd = linkdb["linkname"] if ( - rd == "pubmed_pubmed" or - rd == "pubmed_pubmed_alsoviewed" or - rd == "pubmed_pubmed_combined" or - rd == "pubmed_pubmed_five" or - rd == "pubmed_pubmed_reviews" or - rd == "pubmed_pubmed_reviews_five" + rd == "pubmed_pubmed" + or rd == "pubmed_pubmed_alsoviewed" + or rd == "pubmed_pubmed_combined" + or rd == "pubmed_pubmed_five" + or rd == "pubmed_pubmed_reviews" + or rd == "pubmed_pubmed_reviews_five" ): pass else: diff --git a/triplea/client/topic_extraction.py b/triplea/client/topic_extraction.py index e63a7ec..d86bad6 100644 --- a/triplea/client/topic_extraction.py +++ b/triplea/client/topic_extraction.py @@ -1,4 +1,3 @@ -import time from triplea.config.settings import SETTINGS import requests import json @@ -16,7 +15,7 @@ # URL = SETTINGS.AAA_TOPIC_EXTRACT_ENDPOINT - + # # data to be sent to api # data = { # "Text": text.replace("\n"," "), @@ -47,53 +46,47 @@ # # proxies=proxy_servers) # # except Exception: # # raise Exception("Connection Error.") - - - # async with httpx.AsyncClient() as client: - # response = await client.post(url=URL, - # data=json.dumps(data), - # headers=headers, - # proxies=proxy_servers) - # response.raise_for_status() - - # data = response.json() - # if 'status' in data: - # return data['r'] - # else: - # logger.ERROR('status not exist.') - # raise - - # # extracting data in json format - # try: - # data = r.json() - # if 'status' in data: - # return data['r'] - # else: - # logger.ERROR('status not exist.') - # raise - - # except Exception as ex: - # logger.ERROR(f"Error : {ex}") - # logger.ERROR(f"{type(r)} {r} ") - # raise - - -def extract_topic(text: str, - method: str, - top:int=10, - threshold:float=0) -> list: - - - URL = SETTINGS.AAA_TOPIC_EXTRACT_ENDPOINT - - + + +# async with httpx.AsyncClient() as client: +# response = await client.post(url=URL, +# data=json.dumps(data), +# 
headers=headers, +# proxies=proxy_servers) +# response.raise_for_status() + +# data = response.json() +# if 'status' in data: +# return data['r'] +# else: +# logger.ERROR('status not exist.') +# raise + +# # extracting data in json format +# try: +# data = r.json() +# if 'status' in data: +# return data['r'] +# else: +# logger.ERROR('status not exist.') +# raise + +# except Exception as ex: +# logger.ERROR(f"Error : {ex}") +# logger.ERROR(f"{type(r)} {r} ") +# raise + + +def extract_topic(text: str, method: str, top: int = 10, threshold: float = 0) -> list: + URL = f"{SETTINGS.AAA_TOPIC_EXTRACT_ENDPOINT}/topic" + # data to be sent to api data = { - "Text": text.replace("\n"," "), - "Method": method, - "Top": top, - "Threshold": threshold - } + "Text": text.replace("\n", " "), + "Method": method, + "Top": top, + "Threshold": threshold, + } headers = { "User-Agent": SETTINGS.AAA_CLIENT_AGENT, @@ -112,10 +105,7 @@ def extract_topic(text: str, # sending get request and saving the response as response object try: j_data = json.dumps(data) - r = session.post(url=URL, - data=j_data, - headers=headers, - proxies=proxy_servers) + r = session.post(url=URL, data=j_data, headers=headers, proxies=proxy_servers) except Exception: raise Exception("Connection Error.") @@ -123,10 +113,10 @@ def extract_topic(text: str, # extracting data in json format try: data = r.json() - if 'status' in data: - return data['r'] + if "status" in data: + return data["r"] else: - logger.ERROR('status not exist.') + logger.ERROR("status not exist.") raise except Exception as ex: diff --git a/triplea/client/triple_extraction.py b/triplea/client/triple_extraction.py index e0c72aa..31ba983 100644 --- a/triplea/client/triple_extraction.py +++ b/triplea/client/triple_extraction.py @@ -5,15 +5,12 @@ def extract_triple(text: str) -> list: + URL = SETTINGS.AAA_TOPIC_EXTRACT_ENDPOINT - - URL = SETTINGS.AAA_TOPIC_EXTRACT_ENDPOINT - - # data to be sent to api data = { - "text": text.replace("\n"," "), - } + "text": text.replace("\n", " "), + } headers = { "User-Agent": SETTINGS.AAA_CLIENT_AGENT, @@ -31,20 +28,19 @@ def extract_triple(text: str) -> list: # sending get request and saving the response as response object try: - r = requests.post(url=URL, - data=json.dumps(data), - headers=headers, - proxies=proxy_servers) + r = requests.post( + url=URL, data=json.dumps(data), headers=headers, proxies=proxy_servers + ) except Exception: raise Exception("Connection Error.") # extracting data in json format try: data = r.json() - if 'status' in data: - return data['result'] + if "status" in data: + return data["result"] else: - logger.ERROR('status not exist.') + logger.ERROR("status not exist.") raise except Exception as ex: diff --git a/triplea/config/settings.py b/triplea/config/settings.py index 80a847a..cefd0b4 100644 --- a/triplea/config/settings.py +++ b/triplea/config/settings.py @@ -1,8 +1,7 @@ import os import pathlib -from typing import List, Optional - -from pydantic import AnyHttpUrl, BaseSettings, EmailStr, validator, Field +from typing import Optional +from pydantic import BaseSettings from dotenv import load_dotenv import tomli @@ -13,9 +12,10 @@ load_dotenv(ENV_PATH_FILE, override=True) -with open('pyproject.toml', 'rb') as f: +with open("pyproject.toml", "rb") as f: pyproject = tomli.load(f) - version = pyproject['tool']['poetry']['version'] + version = pyproject["tool"]["poetry"]["version"] + class Settings(BaseSettings): # ---------------My Envirement Varable------------------------------- @@ -25,26 +25,27 @@ class 
Settings(BaseSettings): ) AAA_MONGODB_CONNECTION_URL: Optional[str] = os.getenv( "AAA_MONGODB_CONNECTION_URL", "mongodb://user:pass@127.0.0.1:27017/" - ) + ) AAA_MONGODB_DB_NAME: Optional[str] = os.getenv( - "AAA_MONGODB_DB_NAME","default-aaa-mongo-db" - ) + "AAA_MONGODB_DB_NAME", "default-aaa-mongo-db" + ) AAA_TPS_LIMIT: Optional[int] = os.getenv("AAA_TPS_LIMIT", 1) AAA_PROXY_HTTP: Optional[str] = os.getenv("AAA_PROXY_HTTP", "") AAA_PROXY_HTTPS: Optional[str] = os.getenv("AAA_PROXY_HTTPS", "") - AAA_REFF_CRAWLER_DEEP: Optional[int] = os.getenv( - "AAA_REFF_CRAWLER_DEEP", 1) - AAA_CITED_CRAWLER_DEEP: Optional[int] = os.getenv( - "AAA_CITED_CRAWLER_DEEP", 1) - + AAA_REFF_CRAWLER_DEEP: Optional[int] = os.getenv("AAA_REFF_CRAWLER_DEEP", 1) + AAA_CITED_CRAWLER_DEEP: Optional[int] = os.getenv("AAA_CITED_CRAWLER_DEEP", 1) + AAA_CLIENT_AGENT: Optional[str] = os.getenv( "AAA_CLIENT_AGENT", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0") - AAA_TOPIC_EXTRACT_ENDPOINT:Optional[str] = os.getenv( - "AAA_TOPIC_EXTRACT_ENDPOINT", - "http://localhost:8001/api/v1/topic/") - - VERSION : Optional[str] = version + '.002' # Change this micro version in the development process + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0", # noqa: E501 + ) + AAA_TOPIC_EXTRACT_ENDPOINT: Optional[str] = os.getenv( + "AAA_TOPIC_EXTRACT_ENDPOINT", "http://localhost:8001/api/v1/topic/" + ) + + VERSION: Optional[str] = ( + version + ".001" + ) # Change this micro version in the development process # class Config: # case_sensitive = True diff --git a/triplea/db/mongodb.py b/triplea/db/mongodb.py index 795654b..359c817 100644 --- a/triplea/db/mongodb.py +++ b/triplea/db/mongodb.py @@ -17,7 +17,7 @@ class DB_MongoDB(DataBase): col_edges = db["edges"] col_triple = db["triple"] -#region Article + # region Article def add_new_article(self, article: Article) -> int: article_json = json.loads( @@ -48,23 +48,13 @@ def get_article_pmid_list_by_state(self, state: int): return [] else: return new_la - + def get_article_pmid_list_by_cstate(self, state: int, tag_field: str): if state is None or state == 0: - myquery = { - "$or" : [ - { - tag_field : None - }, - { - tag_field : 0 - } - ] - } + myquery = {"$or": [{tag_field: None}, {tag_field: 0}]} else: myquery = {tag_field: state} - cursor = self.col_article.find(myquery, projection={"PMID": "$PMID", "_id": 0}) la = list(cursor) @@ -147,19 +137,19 @@ def get_article_group_by_state(self): ] return list(self.col_article.aggregate(pipeline)) -#region Extra Article Method + # region Extra Article Method - def change_flag_extract_topic(self,current_value,set_value): + def change_flag_extract_topic(self, current_value, set_value): myquery = {"FlagExtractTopic": current_value} sett = {"$set": {"FlagExtractTopic": set_value}} r = self.col_article.update_many(myquery, sett) return r -#endregion + # endregion -# endregion + # endregion -# region Node + # region Node def add_new_node(self, node: Node) -> int: node_json = json.loads( @@ -181,9 +171,9 @@ def get_all_node_count(self) -> int: def get_all_nodes(self): raise NotImplementedError -# endregion + # endregion -# region Edge + # region Edge def add_new_edge(self, edge: Edge) -> int: edge_json = json.loads( @@ -205,9 +195,9 @@ def get_all_edge_count(self) -> int: def get_all_edges(self): raise NotImplementedError -# endregion + # endregion -# region Triple + # region Triple def add_new_triple(self, edge: dict) -> int: triple_json = json.loads( json.dumps(edge, default=lambda o: 
o.__dict__, sort_keys=True, indent=4) ) result = self.col_triple.insert_one(triple_json) return result.inserted_id -# endregion - + # endregion def close(self): self.client.close diff --git a/triplea/db/tinydb.py b/triplea/db/tinydb.py index 8cee274..d27634c 100644 --- a/triplea/db/tinydb.py +++ b/triplea/db/tinydb.py @@ -33,12 +33,18 @@ def get_article_pmid_list_by_state(self, state: int): q = Query() l_pmid = [a.get("PMID") for a in self.db.search(q.State == state)] return l_pmid - - def get_article_pmid_list_by_cstate(self, state: int,tag_field: str): + + def get_article_pmid_list_by_cstate(self, state: int, tag_field: str): q = Query() - if state is None or state ==0: - # query = (Query().FlagAffiliationMining == 0) | (Query().FlagAffiliationMining == None) | (~Query().FlagAffiliationMining.exists()) - query = (Query()[tag_field] == 0) | (Query()[tag_field] == None) | (~Query()[tag_field].exists()) + if state is None or state == 0: + # query = (Query().FlagAffiliationMining == 0) + # | (Query().FlagAffiliationMining == None) + # | (~Query().FlagAffiliationMining.exists()) + query = ( + (Query()[tag_field] == 0) + | (Query()[tag_field] == None) # noqa: E711 - TinyDB overloads `==` to build the query; `is None` would break it + | (~Query()[tag_field].exists()) + ) l_pmid = [a.get("PMID") for a in self.db.search(query)] else: l_pmid = [a.get("PMID") for a in self.db.search(q[tag_field] == state)] @@ -85,14 +91,15 @@ def get_all_article_count(self) -> int: """ return len(self.db) + # region Extra Article Method -#region Extra Article Method - - def change_flag_extract_topic(self,current_value,set_value): + def change_flag_extract_topic(self, current_value, set_value): # Update the value of "FlagExtractTopic" from 0 to 1 - return self.db.update({'FlagExtractTopic': set_value}, Query().FlagExtractTopic == current_value) + return self.db.update( + {"FlagExtractTopic": set_value}, Query().FlagExtractTopic == current_value + ) -#endregion + # endregion def add_new_node(self, node: Node) -> int: node_json = json.loads( diff --git a/triplea/schemas/article.py b/triplea/schemas/article.py index 080087f..2f01e3f 100644 --- a/triplea/schemas/article.py +++ b/triplea/schemas/article.py @@ -1,6 +1,7 @@ from pydantic import BaseModel, Field from typing import Optional -from typing import Union +import enum + class NamedEntity(BaseModel): Label: Optional[str] = Field(description="") @@ -10,11 +11,18 @@ class Keyword(BaseModel): Text: Optional[str] = Field(description="") IS_Major: Optional[bool] = Field( - description="The MajorTopic attribute is set to True (for Yes) when the MeSH Heading/Keyword alone is a central concept of the article" + description="""The MajorTopic attribute is set to True (for Yes) + when the MeSH Heading/Keyword alone is + a central concept of the article""" ) IS_Mesh: Optional[bool] = Field(description="") +class AffiliationParseMethod(enum.IntEnum): + SIMPLE_PARSE = 1 + TITIPATA_API = 2 # https://github.com/titipata/affiliation_parser + + class Affiliation(BaseModel): HashID: Optional[str] Text: Optional[str] = Field(description="") @@ -26,13 +34,15 @@ class Affiliation(BaseModel): Part6: Optional[str] = Field(description="") Has_Extra: Optional[bool] = Field(description="") Structural: Optional[list[dict]] = Field(description="") - + ParseMethod: Optional[AffiliationParseMethod] = Field(description="") class Author(BaseModel): HashID: Optional[str] LastName: Optional[str] = Field( - description="contains the surname or the single name used by an individual, even if
that single name is not considered to be a surname" + description="""contains the surname or the single name used by + an individual, even if that single name + is not considered to be a surname""" ) ForeName: Optional[str] = Field( description="contains the remainder of name except for suffix" @@ -45,24 +55,24 @@ class Author(BaseModel): class Article(BaseModel): PMID: Optional[str] = Field( description="""the PubMed (NLM database that incorporates MEDLINE) - unique identifier, is a 1 to 8-digit accession number + unique identifier, is a 1 to 8-digit accession number with no leading zeros.""" ) DOI: Optional[str] = Field(description="") PMC: Optional[str] = Field( - description="""This is a unique reference number or identifier + description="""This is a unique reference number or identifier that is assigned to every article that is accepted into PMC.""" ) Title: Optional[str] = Field( - description="""Article Title contains the entire title of + description="""Article Title contains the entire title of the journal article. Article Title is always in English; those titles originally published in a non-English language - and translated for Article Title are enclosed + and translated for Article Title are enclosed in square brackets.""" ) Journal: Optional[str] = Field( - description="""The full journal title - (taken from NLM cataloging data following NLM rules + description="""The full journal title + (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element.""" ) Authors: Optional[list[Author]] = Field(description="") @@ -78,7 +88,6 @@ class Article(BaseModel): ReferenceCrawlerDeep: Optional[int] = Field(description="") CiteCrawlerDeep: Optional[int] = Field(description="") NamedEntities: Optional[list[NamedEntity]] = Field(description="") - FlagExtractKG : Optional[int] = Field(description="") - FlagAffiliationMining : Optional[int] = Field(description="") + FlagExtractKG: Optional[int] = Field(description="") + FlagAffiliationMining: Optional[int] = Field(description="") FlagExtractTopic: Optional[int] = Field(description="") - diff --git a/triplea/schemas/node.py b/triplea/schemas/node.py index e69c224..04e37e1 100644 --- a/triplea/schemas/node.py +++ b/triplea/schemas/node.py @@ -4,7 +4,8 @@ # custom_encoder = lambda obj: dict(_type=type(obj).__name__, **obj.dict()) -# It creates a class called Node with the following attributes: Type, Identifier, and Name. +# It creates a class called Node with the following attributes: +# Type, Identifier, and Name. 
class Node(BaseModel): Type: Optional[str] = Field(description="") Identifier: Optional[str] = Field(description="") @@ -15,7 +16,9 @@ class Node(BaseModel): # sort_keys=True, indent=4) def json(self): - return {"Type": self.Type, "Identifier": self.Identifier, "Name": self.Name} + return {"Type": self.Type, + "Identifier": self.Identifier, + "Name": self.Name} # class Config: # json_encoders = { @@ -23,7 +26,8 @@ def json(self): # } -# The Edge class is a model that has four fields: HashID, SourceID, DestinationID, and Type +# The Edge class is a model that has four fields: +# HashID, SourceID, DestinationID, and Type class Edge(BaseModel): HashID: Optional[str] = Field(description="") SourceID: Optional[str] = Field(description="") diff --git a/triplea/service/graph/analysis/diameter.py b/triplea/service/graph/analysis/diameter.py index ef81390..5a4c78d 100644 --- a/triplea/service/graph/analysis/diameter.py +++ b/triplea/service/graph/analysis/diameter.py @@ -1,11 +1,11 @@ - import networkx as nx + def graph_diameter(G): """ This function calculates the diameter of a network graph using the eccentricity of each node. - + :param G: a networkx graph object representing a network :return: the diameter of the input graph G. """ diff --git a/triplea/service/graph/analysis/ganalysis.py b/triplea/service/graph/analysis/ganalysis.py index f176dfd..c9a9984 100644 --- a/triplea/service/graph/analysis/ganalysis.py +++ b/triplea/service/graph/analysis/ganalysis.py @@ -5,7 +5,7 @@ import matplotlib.pyplot as plt import pandas as pd from netwulf import visualize -from networkx.classes.function import is_directed +# from networkx.classes.function import is_directed def visualize_and_grouping(G): @@ -32,6 +32,7 @@ def sorted_average_neighbor_degree(G) -> pd.Series: dcs = dcs.sort_values(ascending=False) return dcs + # def sorted_in_degree(G)->pd.Series: # # NetworkX provides a function for us to calculate degree centrality conveniently: # dcs = pd.Series(G.in_degree()) @@ -101,6 +102,7 @@ def get_top_keys(dictionary, top): top = {k: sort_items[k] for k in list(sort_items)[:top]} return top + def get_avg_shortest_path_length_per_node(G): """ Calculate the average shortest-path length for each node in the graph. @@ -109,7 +111,8 @@ def get_avg_shortest_path_length_per_node(G): G (networkx.Graph): The input graph. Returns: - pandas.Series: A series containing the average shortest-path length for each node, sorted in descending order. + pandas.Series: A series containing the average shortest-path length + for each node, sorted in descending order. 
""" # Calculate the average shortest-path length for each node @@ -118,15 +121,19 @@ def get_avg_shortest_path_length_per_node(G): # Store the average shortest-path length for each node in a list of tuples ll = [] for node in avg_shortest_path_lengths: - avg_shortest_path_length = sum(avg_shortest_path_lengths[node].values()) / (len(G) - 1) + avg_shortest_path_length = sum(avg_shortest_path_lengths[node].values()) / ( + len(G) - 1 + ) ll.append((node, avg_shortest_path_length)) - # Convert the list of tuples to a pandas Series and sort it in descending order + # Convert the list of tuples to a pandas Series + # and sort it in descending order dcs = pd.Series(dict(ll)) dcs = dcs.sort_values(ascending=False) return dcs + def get_clustering_coefficient_per_node(G): # Calculate the clustering coefficient for each node s = {} @@ -138,14 +145,14 @@ def get_clustering_coefficient_per_node(G): else: num_connected = 0 for i in range(len(neighbors)): - for j in range(i+1, len(neighbors)): + for j in range(i + 1, len(neighbors)): if G.has_edge(neighbors[i], neighbors[j]): num_connected += 1 - cc = num_connected / (len(neighbors) * (len(neighbors)-1) / 2) + cc = num_connected / (len(neighbors) * (len(neighbors) - 1) / 2) # print(f"Node {node}: {cc}") s[node] = cc - + dcs = pd.Series(s) dcs = dcs.sort_values(ascending=False) - return dcs \ No newline at end of file + return dcs diff --git a/triplea/service/graph/analysis/info.py b/triplea/service/graph/analysis/info.py index c53e391..a167197 100644 --- a/triplea/service/graph/analysis/info.py +++ b/triplea/service/graph/analysis/info.py @@ -2,6 +2,7 @@ from networkx import is_directed import time + def info(G, format="stdout"): start_time = time.time() graph_type = "" @@ -26,7 +27,6 @@ def info(G, format="stdout"): diameter = nx.diameter(G) num_components = nx.number_connected_components(G) - density = nx.density(G) transitivity = nx.transitivity(G) number_of_edges = nx.number_of_edges(G) @@ -36,21 +36,21 @@ def info(G, format="stdout"): dag_longest_path_length = nx.dag_longest_path_length(G) except Exception: # Graph contains a cycle or graph changed during iteration - dag_longest_path_length = 'NaN' + dag_longest_path_length = "NaN" average_clustering = nx.average_clustering(G) degree_assortativity_coefficient = nx.degree_assortativity_coefficient(G) - + try: radius = nx.algorithms.distance_measures.radius(G) except Exception as ex: - radius = f'NaN {ex}' + radius = f"NaN {ex}" end_time = time.time() elapsed_time = end_time - start_time report_time = start_time - - if format=="stdout": + + if format == "stdout": print(f"Report Time : {report_time}") print(f"Elapsed Time Calculation Report : {elapsed_time}") print(f"Graph Type: {graph_type}") @@ -69,7 +69,7 @@ def info(G, format="stdout"): print(f"WCC: {wcc}") print(f"Reciprocity : {reciprocity}") print(f"Graph Diameter : {diameter}") - print(f"Number of Components : {num_components}") + print(f"Number of Components : {num_components}") # bet_cen = nx.betweenness_centrality(G) # clo_cen = nx.closeness_centrality(G) @@ -77,32 +77,32 @@ def info(G, format="stdout"): # print(f'Graph Betweenness Centrality: {get_top_keys(bet_cen,1)}') # print(f'Graph Closeness Centrality: {get_top_keys(clo_cen,1)}') # print(f'Graph Eigenvector Centrality : {get_top_keys(eig_cen, 1)}') - elif format=="json": + elif format == "json": data = { - "Report Time" : report_time, - "Elapsed Time Calculation Report" : elapsed_time, - "Graph Type" : graph_type, - "Graph Nodes" : number_of_nodes, - "Graph Edges" : number_of_edges, - 
"Graph Average Degree" : avg_deg, - "Graph Density" : density, - "Graph Transitivity" : transitivity, - "Graph max path length" : dag_longest_path_length, - "Graph Average Clustering Coefficient" : average_clustering, - "Graph Degree Assortativity Coefficient" : degree_assortativity_coefficient, - "Graph Radius" : radius, - "SCC" : scc, - "WCC" : wcc, - "Reciprocity" : reciprocity, - "Graph Diameter" : diameter, - "Number of Components" : num_components, + "Report Time": report_time, + "Elapsed Time Calculation Report": elapsed_time, + "Graph Type": graph_type, + "Graph Nodes": number_of_nodes, + "Graph Edges": number_of_edges, + "Graph Average Degree": avg_deg, + "Graph Density": density, + "Graph Transitivity": transitivity, + "Graph max path length": dag_longest_path_length, + "Graph Average Clustering Coefficient": average_clustering, + "Graph Degree Assortativity Coefficient": degree_assortativity_coefficient, + "Graph Radius": radius, + "SCC": scc, + "WCC": wcc, + "Reciprocity": reciprocity, + "Graph Diameter": diameter, + "Number of Components": num_components, } return data - elif format=="string": + elif format == "string": rep = "" rep += f"Report Time : {report_time}" - rep +=f"Elapsed Time Calculation Report : {elapsed_time}" - rep +=f"Graph Type: {graph_type}" + rep += f"Elapsed Time Calculation Report : {elapsed_time}" + rep += f"Graph Type: {graph_type}" rep += f"Graph Nodes: {number_of_nodes}" rep += f"Graph Edges: {number_of_edges}" rep += f"Graph Average Degree : {avg_deg}" diff --git a/triplea/service/graph/analysis/neighbor_number.py b/triplea/service/graph/analysis/neighbor_number.py index f49a916..3020011 100644 --- a/triplea/service/graph/analysis/neighbor_number.py +++ b/triplea/service/graph/analysis/neighbor_number.py @@ -1,5 +1,3 @@ - - def average_neighbor_number(G) -> float: """ For each node in the graph, count the number of neighbors and calculate average neighbor number for graph. 
@@ -18,4 +16,3 @@ def average_neighbor_number(G) -> float: sum_neighbors = sum_neighbors + num_neighbors return sum_neighbors / i - diff --git a/triplea/service/graph/export/export.py b/triplea/service/graph/export/export.py index 136a452..565ce6f 100644 --- a/triplea/service/graph/export/export.py +++ b/triplea/service/graph/export/export.py @@ -152,7 +152,8 @@ def export_gpickle_from_graphdict(graphdict: dict, filename: str): def export_gexf_from_arepo(filename: str): """ - It read article repository and extract node & edge from it, and then saves it in the [gexf format](https://gexf.net/) + It read article repository and extract node & edge from it, + and then saves it in the [gexf format](https://gexf.net/) """ G = export_networkX_from_arepo() # saving graph created above in gexf format @@ -163,31 +164,35 @@ def export_gexf_from_graphdict(graphdict: dict, filename: str): """ It takes a graph dictionary and exports it as a GEXF file - :param graphdict: a dictionary of dictionaries, where the keys are the nodes and the values are - dictionaries of the nodes' neighbors and the weights of the edges between them + :param graphdict: a dictionary of dictionaries, + where the keys are the nodes and the values are + dictionaries of the nodes' neighbors and the weights of the edges + between them :type graphdict: dict :param filename: the name of the file you want to save the graph as :type filename: str """ G = export_networkx_from_graphdict(graphdict) - nx.write_gexf(G, filename ) + nx.write_gexf(G, filename) def export_graphml_from_arepo(filename: str): """ - It read article repository and extract node & edge from it and exports it as a [graphml file](http://graphml.graphdrawing.org/) + It read article repository and extract node & edge from it + and exports it as a [graphml file](http://graphml.graphdrawing.org/) :param filename: the name of the file you want to save the graphml file as :type filename: str """ G = export_networkX_from_arepo() # saving graph created above in graphml format - nx.write_graphml(G, filename ) + nx.write_graphml(G, filename) def export_graphml_from_networkx(G: nx.Graph, filename: str): """ - It takes a networkx graph and a filename, and exports the graph to a graphml file with the given + It takes a networkx graph and a filename, and exports the graph + to a graphml file with the given filename :param G: the networkx graph object @@ -271,7 +276,8 @@ def export_networkx_from_graphdict( """ It takes a graph dictionary and returns a networkx graph - :param graphdict: The graph dictionary that you want to convert to a networkx graph + :param graphdict: The graph dictionary that you want to convert + to a networkx graph :param graph_type: Optional[str] = 'directed', defaults to directed :type graph_type: Optional[str] (optional) :return: A networkx graph object @@ -292,7 +298,7 @@ def export_networkx_from_graphdict( return G -# #------------------------------------------------------------------------------------------------------------- +# #---------------------------------------------------------------------------- def export_networkX( nodes: list[Node], edges: list[Edge], graph_type: Optional[str] = "directed" ): diff --git a/triplea/service/graph/extract/__init__.py b/triplea/service/graph/extract/__init__.py index 838ae71..a25e9dc 100644 --- a/triplea/service/graph/extract/__init__.py +++ b/triplea/service/graph/extract/__init__.py @@ -15,7 +15,9 @@ from triplea.service.graph.extract.keyword import graph_extract_article_keyword from 
triplea.service.graph.extract.reference import graph_extract_article_reference from triplea.service.graph.extract.cited import graph_extract_article_cited -from triplea.service.graph.extract.country_based_co_authorship import graph_extract_article_country +from triplea.service.graph.extract.country_based_co_authorship import ( + graph_extract_article_country, +) __all__ = [ @@ -197,7 +199,6 @@ def graph_extractor( else: return {"nodes": l_nodes, "edges": l_edges} - if article is not None: # data = _extract_article_topic(article) data = func(article) @@ -223,7 +224,7 @@ def graph_extractor( def graph_extractor_all_entity( state: Optional[int] = None, limit_node: Optional[int] = 0, - remove_duplicate: Optional[bool] = True + remove_duplicate: Optional[bool] = True, ): """ It takes a list of articles, extracts the graph from each article, and then combines all the graphs @@ -276,12 +277,13 @@ def graph_extractor_all_entity( logger.DEBUG("Remove duplication in Nodes & Edges. ") n = Emmanuel(l_nodes) e = Emmanuel(l_edges) - logger.DEBUG(f"Final {len(n)} Nodes & {len(e)} Edges Extracted.") + logger.DEBUG( + f"Final {len(n)} Nodes & {len(e)} Edges Extracted." + ) return {"nodes": n, "edges": e} else: return {"nodes": l_nodes, "edges": l_edges} - if article is not None: # Extracting the graph from the article. graphdict1 = graph_extract_article_author_affiliation(article) @@ -315,7 +317,6 @@ def graph_extractor_all_entity( return {"nodes": l_nodes, "edges": l_edges} - def check_upper_term(n: dict, text: str): """ It takes a node and a string as input, and if the node's name contains the string, it returns a new @@ -367,5 +368,4 @@ def check_upper_term(n: dict, text: str): # graphdict = {"nodes": l_nodes, "edges": l_edges} # data= json.dumps(graphdict, indent=4) # with open("bcancer-all.json", "w") as outfile: - # outfile.write(data) - + # outfile.write(data) diff --git a/triplea/service/graph/extract/country_based_co_authorship.py b/triplea/service/graph/extract/country_based_co_authorship.py index d2c4e9e..004a37a 100644 --- a/triplea/service/graph/extract/country_based_co_authorship.py +++ b/triplea/service/graph/extract/country_based_co_authorship.py @@ -1,7 +1,10 @@ from triplea.schemas.article import Article from triplea.schemas.node import Edge, Node + # from triplea.service.graph.extract import Emmanuel -from triplea.service.repository.state.custom.affiliation_mining import get_structured_affiliation +from triplea.service.repository.state.custom.affiliation_mining import ( + get_structured_affiliation, +) def graph_extract_article_country(article: Article) -> dict: @@ -14,16 +17,17 @@ def graph_extract_article_country(article: Article) -> dict: nodes.append(node_article.dict()) affiliation_list = get_structured_affiliation(article) - + # affiliation_list = Emmanuel(affiliation_list) - affiliation_list = [i for n, i in enumerate(affiliation_list) if i not in affiliation_list[n + 1 :]] - - + affiliation_list = [ + i for n, i in enumerate(affiliation_list) if i not in affiliation_list[n + 1:] + ] + for af in affiliation_list: - if 'country' in af: + if "country" in af: node_country = Node() - node_country.Identifier = af['country'] - node_country.Name = af['country'] + node_country.Identifier = af["country"] + node_country.Name = af["country"] node_country.Type = "Country" nodes.append(node_country.dict()) @@ -31,11 +35,7 @@ def graph_extract_article_country(article: Article) -> dict: edge.SourceID = node_article.Identifier edge.DestinationID = node_country.Identifier edge.Type = "IS" - 
edge.HashID = str( - hash(edge.SourceID + edge.DestinationID + edge.Type) - ) + edge.HashID = str(hash(edge.SourceID + edge.DestinationID + edge.Type)) edges.append(edge.dict()) - - return {"nodes": nodes, "edges": edges} - + return {"nodes": nodes, "edges": edges} diff --git a/triplea/service/graph/extract/topic.py b/triplea/service/graph/extract/topic.py index 039a10a..3a96f0b 100644 --- a/triplea/service/graph/extract/topic.py +++ b/triplea/service/graph/extract/topic.py @@ -1,5 +1,6 @@ from triplea.schemas.article import Article from triplea.schemas.node import Edge, Node + # import spacy # import pytextrank from triplea.service.click_logger import logger @@ -26,12 +27,11 @@ def graph_extract_article_topic(article: Article) -> dict: node_article.Type = "Article" nodes.append(node_article.dict()) - if article.Topics is not None: for t in article.Topics: node_topic = Node() - node_topic.Identifier = t['text'].lower() - node_topic.Name = t['text'].lower() + node_topic.Identifier = t["text"].lower() + node_topic.Name = t["text"].lower() node_topic.Type = "Topic" nodes.append(node_topic.dict()) @@ -39,7 +39,7 @@ def graph_extract_article_topic(article: Article) -> dict: edge.SourceID = node_article.Identifier edge.DestinationID = node_topic.Identifier edge.Type = "TOPIC" - edge.Weight = t['rank'] + edge.Weight = t["rank"] edge.HashID = str(hash(edge.SourceID + edge.DestinationID + edge.Type)) edges.append(edge.dict()) else: diff --git a/triplea/service/nlp/location_extract.py b/triplea/service/nlp/location_extract.py index c68d04c..d3d62e9 100644 --- a/triplea/service/nlp/location_extract.py +++ b/triplea/service/nlp/location_extract.py @@ -8,9 +8,9 @@ city = match.group(1) state = match.group(2) country = match.group(3) - + print(f"City: {city}") print(f"State: {state}") print(f"Country: {country}") else: - print("No location information found in the affiliation.") \ No newline at end of file + print("No location information found in the affiliation.") diff --git a/triplea/service/nlp/test.py b/triplea/service/nlp/test.py index 6612f55..ad661ab 100644 --- a/triplea/service/nlp/test.py +++ b/triplea/service/nlp/test.py @@ -28,4 +28,4 @@ # if subject and object: # triples.append((subject[0], subtree[0], object[0])) # for triple in triples: -# print("Triple:", (triple[0].text, triple[1].text, triple[2].text)) \ No newline at end of file +# print("Triple:", (triple[0].text, triple[1].text, triple[2].text)) diff --git a/triplea/service/nlp/test1.py b/triplea/service/nlp/test1.py index 93ee934..f3e2f89 100644 --- a/triplea/service/nlp/test1.py +++ b/triplea/service/nlp/test1.py @@ -7,7 +7,7 @@ # def extract_triples(text): # triples = [] # doc = nlp(text) - + # for sentence in doc.sents: # for token in sentence: # if "subj" in token.dep_: @@ -15,11 +15,11 @@ # predicate = token.head.text # object_ = token.head.head.text # triples.append((subject, predicate, object_)) - + # return triples # if __name__ == "__main__": # text = "Steve Jobs was the co-founder of Apple Inc." 
# triples = extract_triples(text) # for triple in triples: -# print(triple) \ No newline at end of file +# print(triple) diff --git a/triplea/service/nlp/test2.py b/triplea/service/nlp/test2.py index 6ff5dd8..bab55ab 100644 --- a/triplea/service/nlp/test2.py +++ b/triplea/service/nlp/test2.py @@ -24,4 +24,4 @@ # print(f" - children : {child.text} ({child.dep_})") # if "obj" in child.dep_: # object_ = child.text -# # triples.append((subject, predicate, object_)) \ No newline at end of file +# # triples.append((subject, predicate, object_)) diff --git a/triplea/service/nlp/test3.py b/triplea/service/nlp/test3.py index 7b61523..3eecc1f 100644 --- a/triplea/service/nlp/test3.py +++ b/triplea/service/nlp/test3.py @@ -22,9 +22,8 @@ # # if object: # # triples.append((subject, predicate, object)) - -# # return triples +# # return triples # if __name__ == "__main__": @@ -54,4 +53,3 @@ # object = child.text # if object: # triples.append((subject, predicate, object)) - diff --git a/triplea/service/nlp/triple_extract.py b/triplea/service/nlp/triple_extract.py index be12062..64098ec 100644 --- a/triplea/service/nlp/triple_extract.py +++ b/triplea/service/nlp/triple_extract.py @@ -30,7 +30,7 @@ # nlp = English() # #nlp.add_pipe(nlp.create_pipe('sentencizer')) # nlp.add_pipe('sentencizer') - + # document = nlp(text) # return [sent.text.strip() for sent in document.sents] @@ -89,7 +89,7 @@ # if tokens[token.i + 1].dep_ == "aux": # relation = appendChunk(relation, tokens[token.i + 1].lemma_) # relation = appendChunk(relation, tokens[token.i + 2].lemma_) - + # firstRelation = True # if isConstructionCandidate(token): # pLog("isConstructionCandidate") @@ -116,7 +116,7 @@ # pLog("Inja") # subject = tokens[i-1].text + subject # if "compound" in tokens[i-1].dep_: -# subject = tokens[i-1].text + subject +# subject = tokens[i-1].text + subject # pLog("subject tokens[i-1].text (" + tokens[i-1].text + ") ----------------> " + subject) # firstSubject = True @@ -149,7 +149,6 @@ # return triples - # def extract_triples(text:str): # sentences = getSentences(text) @@ -165,9 +164,9 @@ # dict['sentence'] = sentence # kg.append(dict) - + # return kg # if __name__ == "__main__": # text = "SpaCy is a powerful natural language processing library." 
-# extract_triples(text) \ No newline at end of file +# extract_triples(text) diff --git a/triplea/service/repository/convert_db.py b/triplea/service/repository/convert_db.py index 00915b8..168e5b7 100644 --- a/triplea/service/repository/convert_db.py +++ b/triplea/service/repository/convert_db.py @@ -23,9 +23,7 @@ def convert_arepo_mongodb_to_tinydb( destination_db = DB_TinyDB() - bar = click.progressbar(length=article_count, - show_pos=True, - show_percent=True) + bar = click.progressbar(length=article_count, show_pos=True, show_percent=True) n = 0 for id in l_pmid: a = source_db.get_article_by_pmid(id) diff --git a/triplea/service/repository/export/__init__.py b/triplea/service/repository/export/__init__.py index fd67098..4e5235a 100644 --- a/triplea/service/repository/export/__init__.py +++ b/triplea/service/repository/export/__init__.py @@ -5,10 +5,11 @@ export_triplea_csvs_in_relational_mode_save_file, ) from triplea.service.repository.export.llm import export_pretrain_llm_in_dir + __all__ = [ "export_rayyan_csv", "export_triplea_json", "export_pretrain_llm_in_dir", "export_triplea_csv", "export_triplea_csvs_in_relational_mode_save_file", -] \ No newline at end of file +] diff --git a/triplea/service/repository/export/llm.py b/triplea/service/repository/export/llm.py index 45d7271..54052a0 100644 --- a/triplea/service/repository/export/llm.py +++ b/triplea/service/repository/export/llm.py @@ -4,15 +4,16 @@ from triplea.schemas.article import Article import triplea.service.repository.persist as persist from triplea.service.click_logger import logger -from triplea.config.settings import ROOT -def export_pretrain_llm_in_dir(output_directory, Merge=True, proccess_bar=False, limit_sample=0,state = None): +def export_pretrain_llm_in_dir( + output_directory, Merge=True, proccess_bar=False, limit_sample=0, state=None +): """ The function `export_pretrain_llm_in_dir` exports text data from articles in a given directory, either merging them into a single file or saving each article separately. 
- + :param output_directory: The directory where the exported files will be saved :param Merge: The `Merge` parameter determines whether to merge all the extracted text into a single file (`True`) or save each article's text in a @@ -34,7 +35,9 @@ def export_pretrain_llm_in_dir(output_directory, Merge=True, proccess_bar=False, logger.INFO(f"{str(len(l_pmid))} Article(s) in Article Repository") else: l_pmid = persist.get_article_pmid_list_by_state(state) - logger.INFO(f"{str(len(l_pmid))} Article(s) is in state {str(state)} in Article Repository") + logger.INFO( + f"{str(len(l_pmid))} Article(s) is in state {str(state)} in Article Repository" + ) n = 0 if proccess_bar: @@ -70,37 +73,33 @@ def export_pretrain_llm_in_dir(output_directory, Merge=True, proccess_bar=False, if limit_sample != 0: # Unlimited if n > limit_sample: break - text_extract = "" - a_title="" - a_abstract ="" + a_title = "" + a_abstract = "" if article is not None: if article.Title is not None: - a_title=article.Title + a_title = article.Title if article.Abstract is not None: a_abstract = article.Abstract - text_extract = a_title + ' ' + a_abstract + text_extract = a_title + " " + a_abstract if Merge: Merge_text_extract = Merge_text_extract + text_extract - else: # Merge = false + else: # Merge = false file_path = os.path.join(output_directory, f"{article.PMID}.txt") - f = open(file_path, "w", encoding='utf-8') + f = open(file_path, "w", encoding="utf-8") f.write(text_extract) f.close() if proccess_bar: bar.label = f"Article ({n}) (PMID : {article.PMID}): Saved." - if Merge: file_path = os.path.join(output_directory, "pretrain_llm.txt") - f = open(file_path, "w", encoding='utf-8') + f = open(file_path, "w", encoding="utf-8") f.write(Merge_text_extract) f.close() print() logger.INFO("Task Done.") - - diff --git a/triplea/service/repository/export/rayyan_format.py b/triplea/service/repository/export/rayyan_format.py index de834a1..1fface4 100644 --- a/triplea/service/repository/export/rayyan_format.py +++ b/triplea/service/repository/export/rayyan_format.py @@ -1,4 +1,3 @@ -import json import sys from triplea.service.click_logger import logger from triplea.schemas.article import Article @@ -6,7 +5,6 @@ import traceback - """ This function exports article data from a database to a CSV file in a specific format. @@ -30,7 +28,9 @@ 7. Format the fields and append them to the CSV data. 8. Return the CSV data. """ -def export_rayyan_csv()-> str: + + +def export_rayyan_csv() -> str: l_pmid = persist.get_all_article_pmid_list() logger.DEBUG(f"{str(len(l_pmid))} Article(s) Selected.") @@ -38,10 +38,12 @@ def export_rayyan_csv()-> str: number_of_article_move_forward = 0 refresh_point = 0 - nodes = [] - edges = [] csv = "" - csv = csv + """key,title,authors,issn,volume,issue,pages,year,publisher,url,abstract,notes,doi,keywords""" + "\n" + csv = ( + csv + + """key,title,authors,issn,volume,issue,pages,year,publisher,url,abstract,notes,doi,keywords""" + + "\n" + ) n = 0 for id in l_pmid: try: @@ -53,7 +55,7 @@ def export_rayyan_csv()-> str: f"There are {str(total_article_in_current_state - number_of_article_move_forward)} article(s) left ... ", forecolore="yellow", ) - + else: refresh_point = refresh_point + 1 @@ -64,53 +66,55 @@ def export_rayyan_csv()-> str: print() print(logger.ERROR(f"Error in parsing article. 
PMID = {id}")) raise Exception("Article Not Parsed.") - #------------------Select ---------------- - n=n+1 - + # ------------------Select ---------------- + n = n + 1 if updated_article.Title.__contains__(","): - title = updated_article.Title.replace('"', ' ') - title = f'"{title}"' + title = updated_article.Title.replace('"', " ") + title = f'"{title}"' else: title = updated_article.Title - + authors = "" issn = "" volume = "" issue = "" pages = "" try: - year = updated_article.OreginalArticle['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['Article']['ArticleDate']['Year'] + year = updated_article.OreginalArticle["PubmedArticleSet"][ + "PubmedArticle" + ]["MedlineCitation"]["Article"]["ArticleDate"]["Year"] except: try: - year = updated_article.OreginalArticle['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['Article']['DateCompleted']['Year'] + year = updated_article.OreginalArticle["PubmedArticleSet"][ + "PubmedArticle" + ]["MedlineCitation"]["Article"]["DateCompleted"]["Year"] except: try: - year = updated_article.OreginalArticle['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['DateCompleted']['Year'] - except: + year = updated_article.OreginalArticle["PubmedArticleSet"][ + "PubmedArticle" + ]["MedlineCitation"]["DateCompleted"]["Year"] + except: year = "0" - # with open("sample.json", "w") as outfile: # json.dump(updated_article.OreginalArticle, outfile) - publisher = "" - url= f"https://pubmed.ncbi.nlm.nih.gov/{updated_article.PMID}/" + url = f"https://pubmed.ncbi.nlm.nih.gov/{updated_article.PMID}/" if updated_article.Abstract is None: abstract = "" - else: + else: if updated_article.Abstract.__contains__(","): - abstract = updated_article.Abstract.replace('"', ' ') - abstract = f'"{abstract}"' + abstract = updated_article.Abstract.replace('"', " ") + abstract = f'"{abstract}"' else: abstract = updated_article.Abstract notes = "" doi = "" keywords = "" - for au in updated_article.Authors: authors = authors + au.FullName + "," @@ -119,23 +123,25 @@ def export_rayyan_csv()-> str: for k in updated_article.Keywords: keywords = keywords + k.Text + ";" - + if keywords != "": if keywords.__contains__(","): keywords = f'"{keywords[:-1]}"' + csv = ( + csv + + f"""{n},{title},{authors},{issn},{volume},{issue},{pages},{year},{publisher},{url},{abstract},{notes},{doi},{keywords}""" + + "\n" + ) - csv = csv + f"""{n},{title},{authors},{issn},{volume},{issue},{pages},{year},{publisher},{url},{abstract},{notes},{doi},{keywords}""" + "\n" - - - #------------------Select ---------------- + # ------------------Select ---------------- except Exception: - exc_type, exc_value, exc_tb = sys.exc_info() - print() - print(exc_tb.tb_lineno) - logger.ERROR(f"Error {exc_type}") - logger.ERROR(f"Error {exc_value}") - traceback.print_tb(exc_tb) + exc_type, exc_value, exc_tb = sys.exc_info() + print() + print(exc_tb.tb_lineno) + logger.ERROR(f"Error {exc_type}") + logger.ERROR(f"Error {exc_value}") + traceback.print_tb(exc_tb) # print(os.path.join('/path/to/Documents',"completeName")) @@ -143,4 +149,3 @@ def export_rayyan_csv()-> str: # file1.write(csv) logger.INFO("Export Complete.") return csv - diff --git a/triplea/service/repository/export/save_article.py b/triplea/service/repository/export/save_article.py index c0f0c4a..d60eea9 100644 --- a/triplea/service/repository/export/save_article.py +++ b/triplea/service/repository/export/save_article.py @@ -1,6 +1,3 @@ - - - import json from bson import ObjectId @@ -13,16 +10,18 @@ def default(self, obj): return str(obj) return 
super().default(obj) -def save_article2json(article:Article, output_file:str): + +def save_article2json(article: Article, output_file: str): article_json = json.loads( - json.dumps(article, default=lambda o: o.__dict__, sort_keys=True, indent=4) - ) - f = open(output_file, "w", encoding='utf-8') + json.dumps(article, default=lambda o: o.__dict__, sort_keys=True, indent=4) + ) + f = open(output_file, "w", encoding="utf-8") f.write(article_json) f.close() -def save_articlestr2json(article:dict, output_file:str): - json_object = json.dumps(article, cls=JSONEncoder, indent = 4) - f = open(output_file, "w", encoding='utf-8') + +def save_articlestr2json(article: dict, output_file: str): + json_object = json.dumps(article, cls=JSONEncoder, indent=4) + f = open(output_file, "w", encoding="utf-8") f.write(json_object) - f.close() \ No newline at end of file + f.close() diff --git a/triplea/service/repository/export/triplea_format.py b/triplea/service/repository/export/triplea_format.py index 4fdb837..074a2d3 100644 --- a/triplea/service/repository/export/triplea_format.py +++ b/triplea/service/repository/export/triplea_format.py @@ -11,7 +11,7 @@ from triplea.utils.general import safe_csv -def export_triplea_json(proccess_bar=False, limit_sample=0)-> str: +def export_triplea_json(proccess_bar=False, limit_sample=0) -> str: l_pmid = persist.get_all_article_pmid_list() logger.DEBUG(f"{str(len(l_pmid))} Article(s) Selected.") @@ -44,42 +44,43 @@ def export_triplea_json(proccess_bar=False, limit_sample=0)-> str: print() print(logger.ERROR(f"Error in parsing article. PMID = {id}")) raise Exception("Article Not Parsed.") - #------------------Select ---------------- - output.append (updated_article) + # ------------------Select ---------------- + output.append(updated_article) if proccess_bar: - bar.label = ( - "Article " - + id - + " exported. " - ) - bar.update(1) - if limit_sample == 0: # unlimited + bar.label = "Article " + id + " exported. " + bar.update(1) + if limit_sample == 0: # unlimited pass else: if n > limit_sample: break - #------------------Select ---------------- + # ------------------Select ---------------- except Exception: - exc_type, exc_value, exc_tb = sys.exc_info() - print() - print(exc_tb.tb_lineno) - logger.ERROR(f"Error {exc_type}") - logger.ERROR(f"Error {exc_value}") - traceback.print_tb(exc_tb) + exc_type, exc_value, exc_tb = sys.exc_info() + print() + print(exc_tb.tb_lineno) + logger.ERROR(f"Error {exc_type}") + logger.ERROR(f"Error {exc_value}") + traceback.print_tb(exc_tb) final = json.dumps(output, default=lambda o: o.__dict__, indent=2) print() logger.INFO("Export Complete.") return final -def export_triplea_csv(proccess_bar=False, limit_sample=0)-> str: + +def export_triplea_csv(proccess_bar=False, limit_sample=0) -> str: l_pmid = persist.get_all_article_pmid_list() logger.DEBUG(f"{str(len(l_pmid))} Article(s) Selected.") total_article_in_current_state = len(l_pmid) refresh_point = 0 csv = "" - csv = csv + """key,title,authors,pmid,year,publisher,url,abstract,state,doi,keywords,topics""" + "\n" + csv = ( + csv + + """key,title,authors,pmid,year,publisher,url,abstract,state,doi,keywords,topics""" + + "\n" + ) n = 0 for id in l_pmid: try: @@ -101,50 +102,52 @@ def export_triplea_csv(proccess_bar=False, limit_sample=0)-> str: print() print(logger.ERROR(f"Error in parsing article. 
PMID = {id}")) raise Exception("Article Not Parsed.") - #------------------Select ---------------- - + # ------------------Select ---------------- if updated_article.Title.__contains__(","): - title = updated_article.Title.replace('"', ' ') - title = f'"{title}"' + title = updated_article.Title.replace('"', " ") + title = f'"{title}"' else: title = updated_article.Title - + authors = "" try: - year = updated_article.OreginalArticle['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['Article']['ArticleDate']['Year'] + year = updated_article.OreginalArticle["PubmedArticleSet"][ + "PubmedArticle" + ]["MedlineCitation"]["Article"]["ArticleDate"]["Year"] except: try: - year = updated_article.OreginalArticle['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['Article']['DateCompleted']['Year'] + year = updated_article.OreginalArticle["PubmedArticleSet"][ + "PubmedArticle" + ]["MedlineCitation"]["Article"]["DateCompleted"]["Year"] except: try: - year = updated_article.OreginalArticle['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['DateCompleted']['Year'] - except: + year = updated_article.OreginalArticle["PubmedArticleSet"][ + "PubmedArticle" + ]["MedlineCitation"]["DateCompleted"]["Year"] + except: year = "0" - # with open("sample.json", "w") as outfile: # json.dump(updated_article.OreginalArticle, outfile) - publisher = updated_article.Journal - url= f"https://pubmed.ncbi.nlm.nih.gov/{updated_article.PMID}/" + url = f"https://pubmed.ncbi.nlm.nih.gov/{updated_article.PMID}/" if updated_article.Abstract is None: abstract = "" - else: + else: if updated_article.Abstract.__contains__(","): - abstract = updated_article.Abstract.replace('"', ' ') - abstract = f'"{abstract}"' + abstract = updated_article.Abstract.replace('"', " ") + abstract = f'"{abstract}"' else: abstract = updated_article.Abstract - notes = "" + doi = updated_article.DOI pmid = updated_article.PMID state = updated_article.State keywords = "" - for au in updated_article.Authors: authors = authors + au.FullName + "," @@ -153,70 +156,75 @@ def export_triplea_csv(proccess_bar=False, limit_sample=0)-> str: for k in updated_article.Keywords: keywords = keywords + k.Text + ";" - + if keywords != "": if keywords.__contains__(","): keywords = f'"{keywords[:-1]}"' for topic in updated_article.Topics: topics = topics + topic + ";" - + if topics != "": if topics.__contains__(","): topics = f'"{topics[:-1]}"' + csv = ( + csv + + f"""{n},{title},{authors},{pmid},{year},{publisher},{url},{abstract},{state},{doi},{keywords},{topics}""" + + "\n" + ) - csv = csv + f"""{n},{title},{authors},{pmid},{year},{publisher},{url},{abstract},{state},{doi},{keywords},{topics}""" + "\n" - - - #------------------Select ---------------- + # ------------------Select ---------------- except Exception: - exc_type, exc_value, exc_tb = sys.exc_info() - print() - print(exc_tb.tb_lineno) - logger.ERROR(f"Error {exc_type}") - logger.ERROR(f"Error {exc_value}") - traceback.print_tb(exc_tb) + exc_type, exc_value, exc_tb = sys.exc_info() + print() + print(exc_tb.tb_lineno) + logger.ERROR(f"Error {exc_type}") + logger.ERROR(f"Error {exc_value}") + traceback.print_tb(exc_tb) logger.INFO("Export Complete.") return csv -def export_triplea_csvs_in_relational_mode_save_file(output_file:str, - proccess_bar=True, - limit_sample=0): - + +def export_triplea_csvs_in_relational_mode_save_file( + output_file: str, proccess_bar=True, limit_sample=0 +): l_pmid = persist.get_all_article_pmid_list() logger.DEBUG(f"{str(len(l_pmid))} Article(s) Selected.") 
total_article_in_current_state = len(l_pmid) - - bar = click.progressbar(length=len(l_pmid), - show_pos=True, - show_percent=True) + bar = click.progressbar(length=len(l_pmid), show_pos=True, show_percent=True) max_refresh_point = 500 refresh_point = 0 csv = "" - authors_csv = "key,authors,affiliations,country,university,institute,center,hospital,department" + "\n" + authors_csv = ( + "key,authors,affiliations,country,university,institute,center,hospital,department,location,email,zipcode" + + "\n" + ) keywords_csv = "key,keywords" + "\n" - topics_csv="key,topics,rank" + "\n" - csv = csv + """key,title,pmid,year,publisher,url,abstract,state,doi,journal_issn,journal_iso_abbreviation,language,publication_type,citation""" + "\n" + topics_csv = "key,topics,rank" + "\n" + csv = ( + csv + + """key,title,pmid,year,publisher,url,abstract,state,doi,journal_issn,journal_iso_abbreviation,language,publication_type,citation""" + + "\n" + ) n = 0 # -------------------Create File------------------------------- file_name = os.path.basename(output_file) file = os.path.splitext(file_name) - fname = file[0] + fname = file[0] fextention = file[1] - - dir = output_file.replace(fname + fextention, '') + dir = output_file.replace(fname + fextention, "") if fextention is None: - fextention= '.csv' + fextention = ".csv" main_file = os.path.join(dir, fname + fextention) - authors_file = os.path.join(dir, fname + '_authors' + fextention) - keywords_file = os.path.join(dir, fname + '_keywords' + fextention) - topics_file = os.path.join(dir, fname + '_topics' + fextention) + authors_file = os.path.join(dir, fname + "_authors" + fextention) + keywords_file = os.path.join(dir, fname + "_keywords" + fextention) + topics_file = os.path.join(dir, fname + "_topics" + fextention) with open(main_file, "w", encoding="utf-8") as file1: file1.write(csv) @@ -234,9 +242,9 @@ def export_triplea_csvs_in_relational_mode_save_file(output_file:str, file4.write(topics_csv) topics_csv = "" - f_main = open(main_file, 'a', encoding="utf-8") - f_authors = open(authors_file, 'a', encoding="utf-8") - f_keywords = open(keywords_file, 'a', encoding="utf-8") + f_main = open(main_file, "a", encoding="utf-8") + f_authors = open(authors_file, "a", encoding="utf-8") + f_keywords = open(keywords_file, "a", encoding="utf-8") f_topics = open(topics_file, "a", encoding="utf-8") for id in l_pmid: @@ -251,10 +259,8 @@ def export_triplea_csvs_in_relational_mode_save_file(output_file:str, f"There are {str(total_article_in_current_state - n)} article(s) left ", forecolore="yellow", ) - if proccess_bar == False: - bar.label = ( - f"There are {str(total_article_in_current_state - n)} article(s) left " - ) + if proccess_bar is False: + bar.label = f"There are {str(total_article_in_current_state - n)} article(s) left " bar.update(max_refresh_point) else: refresh_point = refresh_point + 1 @@ -263,10 +269,9 @@ def export_triplea_csvs_in_relational_mode_save_file(output_file:str, if n > limit_sample: break - a = persist.get_article_by_pmid(id) # a = persist.get_article_by_pmid('18194356') # CRITICAL - + try: updated_article = Article(**a.copy()) except Exception: @@ -274,9 +279,6 @@ def export_triplea_csvs_in_relational_mode_save_file(output_file:str, print(logger.ERROR(f"Error in parsing article. 
PMID = {id}")) raise Exception("Article Not Parsed.") - - - title = "" year = "" publisher = "" @@ -285,7 +287,6 @@ def export_triplea_csvs_in_relational_mode_save_file(output_file:str, language = "" publication_type = "" - if updated_article.Title is not None: title = safe_csv(updated_article.Title) @@ -293,70 +294,89 @@ def export_triplea_csvs_in_relational_mode_save_file(output_file:str, # year = updated_article.OreginalArticle['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['Article']['ArticleDate']['Year'] # except: # try: - # year = updated_article.OreginalArticle['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['Article']['DateCompleted']['Year'] + # year = updated_article.OreginalArticle['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['Article']['DateCompleted']['Year'] # except: # try: - # year = updated_article.OreginalArticle['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['DateCompleted']['Year'] - # except: + # year = updated_article.OreginalArticle['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['DateCompleted']['Year'] + # except: # year = "0" # # with open("sample.json", "w") as outfile: # # json.dump(updated_article.OreginalArticle, outfile) - try: - year = updated_article.OreginalArticle['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Year'] + year = updated_article.OreginalArticle["PubmedArticleSet"][ + "PubmedArticle" + ]["MedlineCitation"]["Article"]["Journal"]["JournalIssue"]["PubDate"][ + "Year" + ] except: try: - year = updated_article.OreginalArticle['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['MedlineDate'] + year = updated_article.OreginalArticle["PubmedArticleSet"][ + "PubmedArticle" + ]["MedlineCitation"]["Article"]["Journal"]["JournalIssue"][ + "PubDate" + ][ + "MedlineDate" + ] except: year = "0" # with open("sample.json", "w") as outfile: - # json.dump(updated_article.OreginalArticle, outfile) + # json.dump(updated_article.OreginalArticle, outfile) publisher = safe_csv(updated_article.Journal) try: - journal_issn = updated_article.OreginalArticle['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['Article']['Journal']['ISSN']['#text'] + journal_issn = updated_article.OreginalArticle["PubmedArticleSet"][ + "PubmedArticle" + ]["MedlineCitation"]["Article"]["Journal"]["ISSN"]["#text"] except: journal_issn = "" - - journal_iso_abbreviation = updated_article.OreginalArticle['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['Article']['Journal']['ISOAbbreviation'] - lang = updated_article.OreginalArticle['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['Article']['Language'] - if isinstance(lang,list): + journal_iso_abbreviation = updated_article.OreginalArticle[ + "PubmedArticleSet" + ]["PubmedArticle"]["MedlineCitation"]["Article"]["Journal"][ + "ISOAbbreviation" + ] + lang = updated_article.OreginalArticle["PubmedArticleSet"]["PubmedArticle"][ + "MedlineCitation" + ]["Article"]["Language"] + if isinstance(lang, list): for l in lang: - language = l + ', ' + language + language = l + ", " + language language = language[:-1] else: language = lang language = safe_csv(language) - - - - p = updated_article.OreginalArticle['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['Article']['PublicationTypeList']['PublicationType'] - if isinstance(p,list): + p = updated_article.OreginalArticle["PubmedArticleSet"]["PubmedArticle"][ + "MedlineCitation" + 
]["Article"]["PublicationTypeList"]["PublicationType"] + if isinstance(p, list): for i in p: - chunk = i['#text'] - publication_type = chunk + ', ' + publication_type + chunk = i["#text"] + publication_type = chunk + ", " + publication_type # publication_type = p[0]['#text'] publication_type = publication_type[:-1] else: - publication_type = updated_article.OreginalArticle['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['Article']['PublicationTypeList']['PublicationType']['#text'] - + publication_type = updated_article.OreginalArticle["PubmedArticleSet"][ + "PubmedArticle" + ]["MedlineCitation"]["Article"]["PublicationTypeList"][ + "PublicationType" + ][ + "#text" + ] + journal_iso_abbreviation = safe_csv(journal_iso_abbreviation) - - publication_type = safe_csv(publication_type) - + publication_type = safe_csv(publication_type) - url= f"https://pubmed.ncbi.nlm.nih.gov/{updated_article.PMID}/" + url = f"https://pubmed.ncbi.nlm.nih.gov/{updated_article.PMID}/" if updated_article.Abstract is None: abstract = "" - else: + else: if updated_article.Abstract.__contains__(","): - abstract = updated_article.Abstract.replace('"', ' ') - abstract = f'"{abstract}"' + abstract = updated_article.Abstract.replace('"', " ") + abstract = f'"{abstract}"' else: abstract = updated_article.Abstract doi = updated_article.DOI @@ -367,92 +387,114 @@ def export_triplea_csvs_in_relational_mode_save_file(output_file:str, if updated_article.CitedBy is not None: citation = len(updated_article.CitedBy) - - - if updated_article.Authors is not None: for au in updated_article.Authors: if au.Affiliations is not None: first_aff = au.Affiliations[0] department = "" hospital = "" - institute ="" + institute = "" country = "" - university="" - center = "" - if first_aff.Structural is not None: + university = "" + center = "" + + location = "" + email = "" + zipcode = "" + + if first_aff.Structural is not None: for s in first_aff.Structural: - if 'department' in s: - department = s['department'] - elif 'hospital' in s: - hospital = s['hospital'] - elif 'institute' in s: - institute = s['institute'] - elif 'country' in s: - country = s['country'] - elif 'university' in s: - university = s['university'] - elif 'center' in s: - center = s['center'] + if "department" in s: + department = s["department"] + elif "hospital" in s: + hospital = s["hospital"] + elif "institute" in s: + institute = s["institute"] + elif ( + "institution" in s + ): # aff.ParseMethod = AffiliationParseMethod.TITIPATA_API + institute = s["institution"] + elif "country" in s: + country = s["country"] + elif "university" in s: + university = s["university"] + elif "center" in s: + center = s["center"] + + elif ( + "location" in s + ): # aff.ParseMethod = AffiliationParseMethod.TITIPATA_API + location = s["location"] + elif ( + "email" in s + ): # aff.ParseMethod = AffiliationParseMethod.TITIPATA_API + email = s["email"] + elif ( + "zipcode" in s + ): # aff.ParseMethod = AffiliationParseMethod.TITIPATA_API + zipcode = s["zipcode"] + else: print(s) aff = first_aff.Text else: aff = None - - str_aff = f"{safe_csv(country)},{safe_csv(university)},{safe_csv(institute)},{safe_csv(center)},{safe_csv(hospital)},{safe_csv(department)}" - authors_csv = authors_csv + f"{n},{safe_csv(au.FullName)},{safe_csv(aff)},{str_aff}" + "\n" + + str_aff = f"{safe_csv(country)},{safe_csv(university)},{safe_csv(institute)},{safe_csv(center)},{safe_csv(hospital)},{safe_csv(department)},{safe_csv(location)},{safe_csv(email)},{safe_csv(zipcode)}" + authors_csv = ( + authors_csv + 
+ f"{n},{safe_csv(au.FullName)},{safe_csv(aff)},{str_aff}" + + "\n" + ) if updated_article.Keywords is not None: for k in updated_article.Keywords: if k is not None: keywords_csv = keywords_csv + f"{n},{safe_csv(k.Text)}" + "\n" - + if updated_article.Topics is not None: for topic in updated_article.Topics: if topic is not None: - topics_csv = topics_csv + f"{n},{safe_csv(topic['text'])},{topic['rank']}" + "\n" - - - csv = csv + f"""{n},{title},{pmid},{year},{publisher},{url},{abstract},{state},{doi},{journal_issn},{journal_iso_abbreviation},{language},{publication_type},{citation}""" + "\n" - + topics_csv = ( + topics_csv + + f"{n},{safe_csv(topic['text'])},{topic['rank']}" + + "\n" + ) + + csv = ( + csv + + f"""{n},{title},{pmid},{year},{publisher},{url},{abstract},{state},{doi},{journal_issn},{journal_iso_abbreviation},{language},{publication_type},{citation}""" + + "\n" + ) if proccess_bar: - bar.label = ( - "Article " - + updated_article.PMID - + " , exported." - ) + bar.label = "Article " + updated_article.PMID + " , exported." bar.update(1) - #------------------Write to file ---------------- + # ------------------Write to file ---------------- f_main.write(csv) csv = "" - f_authors.write(authors_csv) authors_csv = "" - f_keywords.write(keywords_csv) keywords_csv = "" - f_topics.write(topics_csv) topics_csv = "" except Exception: - exc_type, exc_value, exc_tb = sys.exc_info() - print() - print(f"line : {exc_tb.tb_lineno}") - print(f"PMID : {updated_article.PMID}") - logger.ERROR(f"Error {exc_type}") - logger.ERROR(f"Error {exc_value}") - traceback.print_tb(exc_tb) + exc_type, exc_value, exc_tb = sys.exc_info() + print() + print(f"line : {exc_tb.tb_lineno}") + print(f"PMID : {updated_article.PMID}") + logger.ERROR(f"Error {exc_type}") + logger.ERROR(f"Error {exc_value}") + traceback.print_tb(exc_tb) f_main.close() f_authors.close() f_keywords.close() f_topics.close() logger.INFO("Export Complete.") - diff --git a/triplea/service/repository/import_file/__init__.py b/triplea/service/repository/import_file/__init__.py index c22405f..69dae8f 100644 --- a/triplea/service/repository/import_file/__init__.py +++ b/triplea/service/repository/import_file/__init__.py @@ -1,7 +1,9 @@ -from triplea.service.repository.import_file.bib import get_article_from_bibliography_file_format +from triplea.service.repository.import_file.bib import ( + get_article_from_bibliography_file_format, +) from triplea.service.repository.import_file.triplea import import_triplea_json __all__ = [ "get_article_from_bibliography_file_format", "import_triplea_json", -] \ No newline at end of file +] diff --git a/triplea/service/repository/import_file/bib.py b/triplea/service/repository/import_file/bib.py index 4bcb1ba..52023de 100644 --- a/triplea/service/repository/import_file/bib.py +++ b/triplea/service/repository/import_file/bib.py @@ -73,4 +73,3 @@ def get_article_from_bibliography_file_format(filepath: str): logger.INFO("The article was registered in Arepo.") return True - diff --git a/triplea/service/repository/import_file/triplea.py b/triplea/service/repository/import_file/triplea.py index 65b5748..38efd80 100644 --- a/triplea/service/repository/import_file/triplea.py +++ b/triplea/service/repository/import_file/triplea.py @@ -1,9 +1,9 @@ - import json import click import triplea.service.repository.persist as persist from triplea.service.click_logger import logger + def import_triplea_json(filename, proccess_bar=False): """ Imports data from a JSON file and creates articles in a repository using the imported data. 
@@ -28,14 +28,8 @@ def import_triplea_json(filename, proccess_bar=False): for a in data: persist.create_article(a) if proccess_bar: - bar.label = ( - "Article " - + a['PMID'] - + " write in article repo. " - ) + bar.label = "Article " + a["PMID"] + " write in article repo. " bar.update(1) print() persist.refresh() logger.INFO("Import Complete.") - - diff --git a/triplea/service/repository/persist.py b/triplea/service/repository/persist.py index 5b82959..8c1cb37 100644 --- a/triplea/service/repository/persist.py +++ b/triplea/service/repository/persist.py @@ -5,11 +5,7 @@ from triplea.service.click_logger import logger - - - - -#region Article +# region Article def create_article(article: Article): @@ -133,6 +129,7 @@ def get_all_article_count() -> int: """ return db.get_all_article_count() + def get_article_group_by_state(): """ It returns a list of dictionaries, each dictionary containing the state name and the number of @@ -141,17 +138,21 @@ def get_article_group_by_state(): """ return db.get_article_group_by_state() -#region Extra Article Method -def change_flag_extract_topic(current_value,set_value): - return db.change_flag_extract_topic(current_value,set_value) +# region Extra Article Method + + +def change_flag_extract_topic(current_value, set_value): + return db.change_flag_extract_topic(current_value, set_value) -#endregion +# endregion -#endregion -#region Node +# endregion + +# region Node + def create_node(node: Node) -> int: """ @@ -183,9 +184,11 @@ def get_all_nodes(): """ return db.get_all_nodes() -#endregion -#region Edge +# endregion + +# region Edge + def create_edge(edge: Edge) -> int: """ @@ -223,9 +226,10 @@ def get_all_edges(): return db.get_all_edges() -#endregion +# endregion + +# region Triple -#region Triple def create_triple(triple: dict) -> int: # Duplication is not checked in this method, @@ -233,7 +237,9 @@ def create_triple(triple: dict) -> int: return db.add_new_triple(triple) -#endregion + +# endregion + def refresh(): """ diff --git a/triplea/service/repository/pipeline_core.py b/triplea/service/repository/pipeline_core.py index 47b7faf..2a44540 100644 --- a/triplea/service/repository/pipeline_core.py +++ b/triplea/service/repository/pipeline_core.py @@ -99,12 +99,12 @@ def move_state_forward( if refresh_point == 500: refresh_point = 0 - # persist.refresh() - # print() - # logger.INFO( - # f"There are {str(total_article_in_current_state - number_of_article_move_forward)} article(s) left ", - # forecolore="yellow", - # ) + persist.refresh() + print() + logger.INFO( + f"There are {str(total_article_in_current_state - number_of_article_move_forward)} article(s) left ", + forecolore="yellow", + ) # min = ( # total_article_in_current_state - number_of_article_move_forward # ) / 60 @@ -117,7 +117,7 @@ def move_state_forward( a = persist.get_article_by_pmid(id) # a = persist.get_article_by_pmid('35970485') # CRITICAL For Test and Debug - + try: updated_article = Article(**a.copy()) except Exception: @@ -190,7 +190,6 @@ def move_state_forward( # else: # logger.ERROR('Duplication has Occurred') - except Exception: if current_state == 1: updated_article = Article(**a.copy()) @@ -220,7 +219,6 @@ def move_state_forward( logger.ERROR(f"Error {exc_type}") logger.ERROR(f"Error {exc_value}") - else: persist.refresh() exc_type, exc_value, exc_tb = sys.exc_info() diff --git a/triplea/service/repository/pipeline_flag.py b/triplea/service/repository/pipeline_flag.py index 89f7be0..342ab9e 100644 --- a/triplea/service/repository/pipeline_flag.py +++ 
b/triplea/service/repository/pipeline_flag.py @@ -11,7 +11,7 @@ def go_extract_triple(): online_bar = True max_refresh_point = 500 - l_pmid = persist.get_article_pmid_list_by_cstate( 0, "FlagExtractKG" ) + l_pmid = persist.get_article_pmid_list_by_cstate(0, "FlagExtractKG") total_article_in_current_state = len(l_pmid) number_of_article_move_forward = 0 logger.DEBUG(str(len(l_pmid)) + " Article(s) is in FlagExtractKG " + str(0)) @@ -33,10 +33,8 @@ def go_extract_triple(): f"There are {str(total_article_in_current_state - number_of_article_move_forward)} article(s) left ", forecolore="yellow", ) - if online_bar == False: - bar.label = ( - f"There are {str(total_article_in_current_state - number_of_article_move_forward)} article(s) left " - ) + if online_bar is False: + bar.label = f"There are {str(total_article_in_current_state - number_of_article_move_forward)} article(s) left " bar.update(max_refresh_point) else: refresh_point = refresh_point + 1 @@ -55,39 +53,41 @@ def go_extract_triple(): if online_bar: bar.label = ( - "Article " - + updated_article.PMID - + " Extract Knowledge Triple From Abstract" + "Article " + + updated_article.PMID + + " Extract Knowledge Triple From Abstract" ) bar.update(1) if current_state is None: - updated_article = state_manager.extract_triple_abstract_save(updated_article) - persist.update_article_by_pmid(updated_article, - updated_article.PMID) + updated_article = state_manager.extract_triple_abstract_save( + updated_article + ) + persist.update_article_by_pmid(updated_article, updated_article.PMID) - elif current_state == -1: - updated_article = state_manager.extract_triple_abstract_save(updated_article) - persist.update_article_by_pmid(updated_article, - updated_article.PMID) + elif current_state == -1: + updated_article = state_manager.extract_triple_abstract_save( + updated_article + ) + persist.update_article_by_pmid(updated_article, updated_article.PMID) - elif current_state == 0: - updated_article = state_manager.extract_triple_abstract_save(updated_article) - persist.update_article_by_pmid(updated_article, - updated_article.PMID) + elif current_state == 0: + updated_article = state_manager.extract_triple_abstract_save( + updated_article + ) + persist.update_article_by_pmid(updated_article, updated_article.PMID) - elif current_state == 1: + elif current_state == 1: pass else: raise NotImplementedError - + except Exception: if current_state == 0 or current_state is None: updated_article = Article(**a.copy()) updated_article.FlagExtractKG = 0 - persist.update_article_by_pmid(updated_article, - updated_article.PMID) + persist.update_article_by_pmid(updated_article, updated_article.PMID) persist.refresh() exc_type, exc_value, exc_tb = sys.exc_info() print() @@ -104,24 +104,22 @@ def go_extract_triple(): logger.ERROR(f"Error {exc_type}") logger.ERROR(f"Error {exc_value}") logger.ERROR(f"Error {exc_tb}") - persist.refresh() + persist.refresh() + def go_extract_topic(proccess_bar=True): max_refresh_point = 100 - l_pmid = persist.get_article_pmid_list_by_cstate( 0, "FlagExtractTopic" ) + l_pmid = persist.get_article_pmid_list_by_cstate(0, "FlagExtractTopic") total_article_in_current_state = len(l_pmid) n = 0 logger.DEBUG(str(len(l_pmid)) + " Article(s) is in FlagExtractTopic " + str(0)) if proccess_bar: - bar = click.progressbar(length=len(l_pmid), - show_pos=True, - show_percent=True) + bar = click.progressbar(length=len(l_pmid), show_pos=True, show_percent=True) refresh_point = 0 - elapsed = 0 + for id in l_pmid: - start_time = time.time() try: n = n + 1 
current_state = None @@ -135,10 +133,8 @@ def go_extract_topic(proccess_bar=True): f"There are {str(total_article_in_current_state - n)} article(s) left ", forecolore="yellow", ) - if proccess_bar == False: - bar.label = ( - f"There are {str(total_article_in_current_state - n)} article(s) left " - ) + if proccess_bar is False: + bar.label = f"There are {str(total_article_in_current_state - n)} article(s) left " bar.update(max_refresh_point) else: refresh_point = refresh_point + 1 @@ -151,40 +147,29 @@ def go_extract_topic(proccess_bar=True): print(logger.ERROR(f"Error in parsing article. PMID = {id}")) raise Exception("Article Not Parsed.") try: - current_state = updated_article.FlagExtractTopic #------------ + current_state = updated_article.FlagExtractTopic # ------------ except Exception: current_state = 0 if proccess_bar: bar.label = ( - "Article " - + updated_article.PMID - + " , topic were extracted." + "Article " + updated_article.PMID + " , topic were extracted." ) bar.update(1) if current_state is None: - updated_article = state_manager.extract_topic_abstract( - updated_article - ) - persist.update_article_by_pmid(updated_article, - updated_article.PMID) + updated_article = state_manager.extract_topic_abstract(updated_article) + persist.update_article_by_pmid(updated_article, updated_article.PMID) - elif current_state == -1: - updated_article = state_manager.extract_topic_abstract( - updated_article - ) - persist.update_article_by_pmid(updated_article, - updated_article.PMID) + elif current_state == -1: + updated_article = state_manager.extract_topic_abstract(updated_article) + persist.update_article_by_pmid(updated_article, updated_article.PMID) - elif current_state == 0: - updated_article = state_manager.extract_topic_abstract( - updated_article - ) - persist.update_article_by_pmid(updated_article, - updated_article.PMID) + elif current_state == 0: + updated_article = state_manager.extract_topic_abstract(updated_article) + persist.update_article_by_pmid(updated_article, updated_article.PMID) - elif current_state == 1: + elif current_state == 1: pass else: @@ -194,8 +179,7 @@ def go_extract_topic(proccess_bar=True): if current_state == 0 or current_state is None: updated_article = Article(**a.copy()) updated_article.FlagExtractTopic = -1 - persist.update_article_by_pmid(updated_article, - updated_article.PMID) + persist.update_article_by_pmid(updated_article, updated_article.PMID) persist.refresh() exc_type, exc_value, exc_tb = sys.exc_info() print() @@ -209,12 +193,11 @@ def go_extract_topic(proccess_bar=True): print(exc_tb.tb_lineno) logger.ERROR(f"Error {exc_type}") logger.ERROR(f"Error {exc_value}") - elapsed = time.time() - start_time - persist.refresh() + persist.refresh() -def go_affiliation_mining(): - l_pmid = persist.get_article_pmid_list_by_cstate( 0, "FlagAffiliationMining" ) +def go_affiliation_mining(method: str = "Simple"): + l_pmid = persist.get_article_pmid_list_by_cstate(0, "FlagAffiliationMining") total_article_in_current_state = len(l_pmid) number_of_article_move_forward = 0 logger.DEBUG(str(len(l_pmid)) + " Article(s) is in FlagAffiliationMining " + str(0)) @@ -238,7 +221,8 @@ def go_affiliation_mining(): forecolore="yellow", ) min = ( - (total_article_in_current_state - number_of_article_move_forward) * elapsed + (total_article_in_current_state - number_of_article_move_forward) + * elapsed ) / 60 logger.INFO( f"It takes at least {str(int(min))} minutes or {str(int(min/60))} hours", @@ -260,31 +244,26 @@ def go_affiliation_mining(): current_state = 0 # 
logger.DEBUG('Article ' + updated_article.PMID + ' with state ' + str(current_state) + ' forward to ' + str(current_state + 1)) - bar.label = ( - "Article " - + updated_article.PMID - + " affiliation mining." - ) + bar.label = "Article " + updated_article.PMID + " affiliation mining." bar.update(1) # # for re run # if current_state == 2 : current_state = 1 - if current_state is None: - updated_article = state_manager.affiliation_mining(updated_article) - persist.update_article_by_pmid(updated_article, - updated_article.PMID) - - elif current_state == -1: - updated_article = state_manager.affiliation_mining(updated_article) - persist.update_article_by_pmid(updated_article, - updated_article.PMID) - - elif current_state == 0: - updated_article = state_manager.affiliation_mining(updated_article) - persist.update_article_by_pmid(updated_article, - updated_article.PMID) + if current_state is None or current_state == -1 or current_state == 0: + if method == "Simple": + updated_article = state_manager.affiliation_mining(updated_article) + persist.update_article_by_pmid( + updated_article, updated_article.PMID + ) + elif method == "Titipata": + updated_article = state_manager.affiliation_mining_titipata( + updated_article + ) + persist.update_article_by_pmid( + updated_article, updated_article.PMID + ) - elif current_state == 1: + elif current_state == 1: pass else: @@ -294,8 +273,7 @@ def go_affiliation_mining(): if current_state == 0 or current_state is None: updated_article = Article(**a.copy()) updated_article.State = -1 - persist.update_article_by_pmid(updated_article, - updated_article.PMID) + persist.update_article_by_pmid(updated_article, updated_article.PMID) persist.refresh() exc_type, exc_value, exc_tb = sys.exc_info() print() @@ -310,4 +288,4 @@ def go_affiliation_mining(): logger.ERROR(f"Error {exc_type}") logger.ERROR(f"Error {exc_value}") elapsed = time.time() - start_time - persist.refresh() \ No newline at end of file + persist.refresh() diff --git a/triplea/service/repository/state/__init__.py b/triplea/service/repository/state/__init__.py index b8f513e..2f80fe8 100644 --- a/triplea/service/repository/state/__init__.py +++ b/triplea/service/repository/state/__init__.py @@ -1,19 +1,19 @@ from triplea.service.repository.state.expand_details import expand_details from triplea.service.repository.state.parsing_details import parsing_details + # from triplea.service.repository.state.ner_title import ner_title from triplea.service.repository.state.get_citation import get_citation from triplea.service.repository.state.initial import ( get_article_list_from_pubmed_all_store_to_arepo, ) from triplea.service.repository.state.custom.extract_kg_abstract import ( - extract_triple_abstract_save + extract_triple_abstract_save, ) from triplea.service.repository.state.custom.affiliation_mining import ( - affiliation_mining -) -from triplea.service.repository.state.custom.extract_topic import ( - extract_topic_abstract + affiliation_mining, + affiliation_mining_titipata, ) +from triplea.service.repository.state.custom.extract_topic import extract_topic_abstract __all__ = [ @@ -23,5 +23,6 @@ "get_article_list_from_pubmed_all_store_to_arepo", "extract_triple_abstract_save", "affiliation_mining", + "affiliation_mining_titipata", "extract_topic_abstract", ] diff --git a/triplea/service/repository/state/custom/affiliation_mining.py b/triplea/service/repository/state/custom/affiliation_mining.py index 2a76ad8..26a2249 100644 --- a/triplea/service/repository/state/custom/affiliation_mining.py +++ 
b/triplea/service/repository/state/custom/affiliation_mining.py @@ -1,12 +1,10 @@ -import sys -from triplea.schemas.article import Article -from triplea.service.click_logger import logger -import triplea.service.repository.persist as persist +from triplea.schemas.article import AffiliationParseMethod, Article from triplea.config.settings import ROOT +import triplea.client.affiliation_parser as client_affiliation_parser country_list = [] -f = open(ROOT.parent / 'datasets' / 'country.txt') +f = open(ROOT.parent / "datasets" / "country.txt") count = 0 while True: count += 1 @@ -16,45 +14,50 @@ break -def _is_email(txt:str) -> bool: - if txt.__contains__('@'): +def _is_email(txt: str) -> bool: + if txt.__contains__("@"): return True else: return False - -def _has_numbers(txt:str): + + +def _has_numbers(txt: str): return any(char.isdigit() for char in txt) -def _is_university(txt:str): +def _is_university(txt: str): if txt.lower().__contains__("university"): return True else: return False - -def _is_center(txt:str): + + +def _is_center(txt: str): if txt.lower().__contains__("center"): return True else: return False - -def _is_department(txt:str): + + +def _is_department(txt: str): if txt.lower().__contains__("department"): return True else: return False -def _is_institute(txt:str): + +def _is_institute(txt: str): if txt.lower().__contains__("institute"): return True else: return False -def _is_hospital(txt:str): + +def _is_hospital(txt: str): """ The function checks if a given string contains the word "hospital" and returns True if it does, otherwise it returns False. - + :param txt: a string that represents a text input that we want to check if it contains the word "hospital" (case insensitive) :type txt: str @@ -65,12 +68,13 @@ def _is_hospital(txt:str): return True else: return False - -def _is_country(txt:str): + + +def _is_country(txt: str): """ This function checks if a given string is a country name by comparing it to a list of countries. 
- + :param txt: a string that represents a country name or code :type txt: str :return: a boolean value (True or False) depending on whether the input string @@ -79,10 +83,11 @@ def _is_country(txt:str): if country_list.__contains__(txt): return True else: - return False + return False + def affiliation_mining(article: Article): - article.FlagAffiliationMining = 1 + article.FlagAffiliationMining = 1 if article.Authors is not None: for a in article.Authors: if a.Affiliations is not None: @@ -92,7 +97,37 @@ def affiliation_mining(article: Article): return article -def get_affiliation_structured(affiliation_text:str)-> dict: +def affiliation_mining_titipata(article: Article): + article.FlagAffiliationMining = 1 + if article.Authors is not None: + for a in article.Authors: + if a.Affiliations is not None: + for aff in a.Affiliations: + affl_normal_text = aff.Text.replace("/", " ") + + affl = client_affiliation_parser.parse_affiliation(affl_normal_text) + loc = [] + if "country" in affl: + loc.append({"country": affl["country"]}) + if "department" in affl: + loc.append({"department": affl["department"]}) + if "email" in affl: + loc.append({"email": affl["email"]}) + if "institution" in affl: + loc.append({"institution": affl["institution"]}) + if "location" in affl: + loc.append({"location": affl["location"]}) + if "zipcode" in affl: + loc.append({"zipcode": affl["zipcode"]}) + + # loc.append({"method" : "Titipata"}) + aff.ParseMethod = AffiliationParseMethod.TITIPATA_API + aff.Structural = loc + + return article + + +def get_affiliation_structured(affiliation_text: str) -> dict: """ Extracts structured information from an affiliation text. @@ -109,30 +144,30 @@ def get_affiliation_structured(affiliation_text:str)-> dict: # Output: [{'university': 'University of XYZ'}, {'department': 'Department of Computer Science'}, {'country': 'Country XYZ'}] """ if affiliation_text is None or affiliation_text == "": - return + return loc = [] aff_part = affiliation_text.split(",") aff_part_number = len(aff_part) country_exist = False - n=0 + n = 0 for p in aff_part: if _is_university(p): - loc.append({ "university" : p.strip()}) + loc.append({"university": p.strip()}) n = n + 1 elif _is_center(p): - loc.append({ "center" : p.strip()}) + loc.append({"center": p.strip()}) n = n + 1 elif _is_department(p): - loc.append({ "department" : p.strip()}) + loc.append({"department": p.strip()}) n = n + 1 elif _is_institute(p): - loc.append({ "institute" : p.strip()}) + loc.append({"institute": p.strip()}) n = n + 1 elif _is_hospital(p): - loc.append({ "hospital" : p.strip()}) + loc.append({"hospital": p.strip()}) n = n + 1 - elif _is_country(p.replace('.', '').strip()): - loc.append({ "country" : p.replace('.', '').strip()}) + elif _is_country(p.replace(".", "").strip()): + loc.append({"country": p.replace(".", "").strip()}) country_exist = True n = n + 1 else: @@ -144,11 +179,10 @@ def get_affiliation_structured(affiliation_text:str)-> dict: # print(loc) # print(affiliation_text) # print(aff_part_number - n) - if country_exist == False: - loc.append({ "country" : "NaN"}) - - return loc + if country_exist is False: + loc.append({"country": "NaN"}) + return loc def get_structured_affiliation(article: Article): @@ -163,7 +197,7 @@ def get_structured_affiliation(article: Article): # This Method fo R&D def affiliation_mining1(article: Article): - article.FlagAffiliationMining = 0 # Critical + article.FlagAffiliationMining = 0 # Critical if article.Authors is not None: for a in article.Authors: if a.Affiliations is not 
None: @@ -172,22 +206,22 @@ def affiliation_mining1(article: Article): aff_part = aff.Text.split(",") aff_part_number = len(aff_part) if aff_part_number > 3: - end_pointer = 1 + end_pointer = 1 country = aff_part[aff_part_number - (end_pointer)] if _is_email(country): email = country - end_pointer = end_pointer + 1 + end_pointer = end_pointer + 1 usename = email.split("@")[0] - if usename.__contains__(' '): + if usename.__contains__(" "): # print("مشکل") - country = "USA" # Critical بعدا درست می کنم + country = "USA" # Critical بعدا درست می کنم else: country = aff_part[aff_part_number - (end_pointer)] # print(email) - + city = aff_part[aff_part_number - (end_pointer + 1)] - country = country.replace('.', '') + country = country.replace(".", "") country = country.strip() if country_list.__contains__(country): pass @@ -196,17 +230,17 @@ def affiliation_mining1(article: Article): pass else: print() - print(f'Country : {country}') - print(aff.Text) + print(f"Country : {country}") + print(aff.Text) # print(f'City : {city}') part3 = aff_part[aff_part_number - (end_pointer + 2)] # print(f'p3 : {part3}') - if part3.__contains__('University'): + if part3.__contains__("University"): university = part3 - elif part3.__contains__('Hospital'): + elif part3.__contains__("Hospital"): hospital = part3 - elif part3.__contains__('Institute'): + elif part3.__contains__("Institute"): institute = part3 else: pass @@ -214,13 +248,7 @@ def affiliation_mining1(article: Article): # print(aff.Text) # raise NotImplementedError - else: # aff_part_number < 3 + else: # aff_part_number < 3 pass - - - - - - - return article \ No newline at end of file + return article diff --git a/triplea/service/repository/state/custom/extract_kg_abstract.py b/triplea/service/repository/state/custom/extract_kg_abstract.py index 345a8e6..ba3b54b 100644 --- a/triplea/service/repository/state/custom/extract_kg_abstract.py +++ b/triplea/service/repository/state/custom/extract_kg_abstract.py @@ -1,24 +1,17 @@ -import sys - from triplea.schemas.article import Article -from triplea.service.click_logger import logger # from triplea.service.nlp.triple_extract import extract_triples # Expire Module from triplea.client.triple_extraction import extract_triple import triplea.service.repository.persist as persist + def extract_triple_abstract_save(article: Article): article.FlagExtractKG = 1 if article.Abstract is not None: # triples_list = extract_triples(article.Abstract) # Expire Module triples_list = extract_triple(article.Abstract) - triples_list = [] # CRITICAL must be API + triples_list = [] # CRITICAL must be API for t in triples_list: - t['PMID'] = article.PMID + t["PMID"] = article.PMID persist.create_triple(t) return article - - - - - diff --git a/triplea/service/repository/state/custom/extract_topic.py b/triplea/service/repository/state/custom/extract_topic.py index 708f4f1..96240eb 100644 --- a/triplea/service/repository/state/custom/extract_topic.py +++ b/triplea/service/repository/state/custom/extract_topic.py @@ -2,8 +2,8 @@ from triplea.client.topic_extraction import extract_topic from triplea.schemas.article import Article from triplea.service.click_logger import logger -# from triplea.service.nlp.topic_extract import extract_textrank +# from triplea.service.nlp.topic_extract import extract_textrank def extract_topic_abstract(article: Article): @@ -17,8 +17,8 @@ def extract_topic_abstract(article: Article): abstract = "" else: abstract = article.Abstract - - text = title + ' ' + abstract + + text = title + " " + abstract text = 
text.replace("\n", "") try: result = extract_topic(text, "textrank") @@ -32,11 +32,8 @@ def extract_topic_abstract(article: Article): logger.ERROR(f"Error {exc_value}") article.FlagExtractTopic = -1 - - # Expire Module - # topic_list = [] # topic_list_phrase = [] # if article.Abstract is not None: @@ -44,7 +41,7 @@ def extract_topic_abstract(article: Article): # # print() # # print(f"Title : {article.Title}") - + # if topic_list_phrase is not None: # for t in topic_list_phrase: @@ -54,7 +51,7 @@ def extract_topic_abstract(article: Article): # # print(type(t)) # # print(t) # # print(t.text) - + # # print(topic_list) # article.Topics = topic_list return article diff --git a/triplea/service/repository/state/get_citation.py b/triplea/service/repository/state/get_citation.py index 7e901f3..72e0ac7 100644 --- a/triplea/service/repository/state/get_citation.py +++ b/triplea/service/repository/state/get_citation.py @@ -2,7 +2,7 @@ from triplea.client.pubmed import get_cited_article_from_pubmed from triplea.schemas.article import Article from triplea.service.click_logger import logger -import triplea.service.repository.persist as persist + def get_citation(article: Article): diff --git a/triplea/service/repository/state/initial.py b/triplea/service/repository/state/initial.py index f5b4de8..f5f4fa3 100644 --- a/triplea/service/repository/state/initial.py +++ b/triplea/service/repository/state/initial.py @@ -8,11 +8,15 @@ def _save_article_pmid_list_in_arepo(data: dict) -> None: """ - > If the data is in the right format, then for each PMID in the data, insert the PMID into the - knowledge repository. If the PMID is not a duplicate, then log the PMID as added to the knowledge + > If the data is in the right format, then for each PMID in the data, + insert the PMID into the + knowledge repository. If the PMID is not a duplicate, + then log the PMID as added to the knowledge repository - :param data: The output format from the pubmed service is for a list of PMIDs that is output from the `get_article_list_from_pubmed` method. + :param data: The output format from the pubmed service is for + a list of PMIDs that is output from + the `get_article_list_from_pubmed` method. :type data: dict """ if "esearchresult" in data: @@ -29,7 +33,6 @@ def _save_article_pmid_list_in_arepo(data: dict) -> None: if i is None: # PMID is Duplicate logger.INFO(f"{pmid} is exist in knowledge repository. ({n})") else: - # logger.INFO('add ' + pmid + ' to knowledge repository. get ' + str(i)) logger.INFO(f"add {pmid} to knowledge repository. ({n})") else: persist.refresh() @@ -37,7 +40,6 @@ def _save_article_pmid_list_in_arepo(data: dict) -> None: persist.refresh() - def get_article_list_from_pubmed_all_store_to_arepo( searchterm: str, tps_limit: Optional[int] = 1, @@ -45,16 +47,19 @@ def get_article_list_from_pubmed_all_store_to_arepo( retmax: Optional[int] = 10000, ) -> None: """ - It takes a search term, and returns a list of all the articles that match that search term + It takes a search term, + and returns a list of all the articles that match that search term :param searchterm: The search term you want to use to search PubMed :type searchterm: str :param tps_limit: The number of requests per second, defaults to 1 :type tps_limit: Optional[int] (optional) - :param big_ret: If True, the function will return a maximum of 10,000 records. If False, it will + :param big_ret: If True, the function will return a maximum + of 10,000 records. 
If False, it will return a maximum of 20 records, defaults to True :type big_ret: Optional[bool] (optional) - :param retmax: The number of articles to return per request, defaults to 10000 + :param retmax: The number of articles to return per request, + defaults to 10000 :type retmax: Optional[int] (optional) """ sleep_time = 1 // tps_limit @@ -97,17 +102,9 @@ def get_article_list_from_pubmed_all_store_to_arepo( # for last round start = ((i + 1) * retmax) - retmax mid = total - (retmax * round) - logger.INFO( - "Round (" - + str(i + 1) - + ") : " - + "Get another " - + str(mid) - + " record (total " - + str(total) - + " record)", - deep=13, - ) + logger.INFO(f"""Round ({str(i + 1)}): + Get another {str(mid)} record (total {str(total)} record)""", + deep=13) chunkdata = get_article_list_from_pubmed(start, retmax, searchterm) _save_article_pmid_list_in_arepo(chunkdata) @@ -121,6 +118,7 @@ def get_article_list_from_pubmed_all_store_to_arepo( start = 1 retmax = 10000 # searchterm = '"breast neoplasms"[MeSH Terms] OR ("breast"[All Fields] AND "neoplasms"[All Fields]) OR "breast neoplasms"[All Fields] OR ("breast"[All Fields] AND "cancer"[All Fields]) OR "breast cancer"[All Fields]' - searchterm = '((Bibliometric analysis[MeSH Terms])) OR ("Bibliometric analysis"[Title/Abstract])' + # searchterm = '((Bibliometric analysis[MeSH Terms])) OR ("Bibliometric analysis"[Title/Abstract])' + searchterm = '"Rajaie Cardiovascular"[Affiliation]' chunkdata = get_article_list_from_pubmed(start, retmax, searchterm) _save_article_pmid_list_in_arepo(chunkdata) diff --git a/triplea/service/repository/state/parsing_details.py b/triplea/service/repository/state/parsing_details.py index 4d7893e..8f6590c 100644 --- a/triplea/service/repository/state/parsing_details.py +++ b/triplea/service/repository/state/parsing_details.py @@ -50,8 +50,8 @@ def _convert_dict_to_class_author(data: dict) -> Author: """ if "CollectiveName" in data: my_author = Author() - if '#text' in data["CollectiveName"]: - my_author.FullName = data["CollectiveName"]['#text'] + if "#text" in data["CollectiveName"]: + my_author.FullName = data["CollectiveName"]["#text"] else: my_author.FullName = data["CollectiveName"] my_author.HashID = str(hash(my_author.FullName)) @@ -93,10 +93,15 @@ def _convert_dict_to_class_keyword(data: dict) -> Keyword: :return: A Keyword object """ my_keyword = Keyword() - if '#text' in data: + if "#text" in data: my_keyword.Text = data["#text"] else: - my_keyword.Text = data["i"] # in PMID 37283018 + if "i" in data: + my_keyword.Text = data["i"] # in PMID 37283018 + else: # in 34358588 + print() + print("Warning in _convert_dict_to_class_keyword line 103.") + my_keyword.Text = "" if "," in my_keyword.Text: pass @@ -109,9 +114,11 @@ def _convert_dict_to_class_keyword(data: dict) -> Keyword: my_keyword.IS_Mesh = False return my_keyword + def _convert_dict_to_reffrence(): pass + def parsing_details(article: Article) -> Article: article.State = 2 backward_state = -1 @@ -120,7 +127,7 @@ def parsing_details(article: Article) -> Article: if data is None: print() logger.ERROR( - f"""Error in Original Article data. It is Null. + f"""Error in Original Article data. It is Null. 
PMID = {article.PMID}""" ) article.State = backward_state @@ -165,24 +172,26 @@ def parsing_details(article: Article) -> Article: elif a_id["@IdType"] == "pmc": article.PMC = a_id["#text"] else: - print() - print(f'article() id type unhandel: {a_id["@IdType"]}') + pass + # print() + # print(f'article() id type unhandel: {a_id["@IdType"]}') elif type(ArticleId) == dict: if ArticleId["@IdType"] == "doi": article.DOI = a_id["#text"] elif ArticleId["@IdType"] == "pmc": article.PMC = a_id["#text"] else: - print() - print(f'article id type unhandel: {a_id["@IdType"]}') + pass + # print() + # print(f'article id type unhandel: {a_id["@IdType"]}') else: raise NotImplementedError # Update Article Title & Journal Title. - pubmed_article_data = data["PubmedArticleSet"]["PubmedArticle"][ - "MedlineCitation" - ]["Article"] + pubmed_article_data = data["PubmedArticleSet"]["PubmedArticle"]["MedlineCitation"][ + "Article" + ] article.Title = pubmed_article_data["ArticleTitle"] if type(article.Title) == dict: article.Title = pubmed_article_data["ArticleTitle"]["#text"] @@ -214,9 +223,7 @@ def parsing_details(article: Article) -> Article: raise NotImplementedError # Creating a list of keywords. Merging Mesh List & Keyword List - medline_citation = data["PubmedArticleSet"]["PubmedArticle"][ - "MedlineCitation" - ] + medline_citation = data["PubmedArticleSet"]["PubmedArticle"]["MedlineCitation"] keyword_list = [] if "MeshHeadingList" in medline_citation: if type(medline_citation["MeshHeadingList"]["MeshHeading"]) == list: @@ -267,15 +274,13 @@ def parsing_details(article: Article) -> Article: article.ReferenceCrawlerDeep = 0 reference_list = [] - - if isinstance(PubmedData["ReferenceList"],list): + + if isinstance(PubmedData["ReferenceList"], list): print(PubmedData["ReferenceList"]) for ref in PubmedData["ReferenceList"]: if "ArticleIdList" in ref: if type(ref["ArticleIdList"]["ArticleId"]) == dict: - if ref["ArticleIdList"]["ArticleId"][ - "@IdType" - ] == "pubmed": + if ref["ArticleIdList"]["ArticleId"]["@IdType"] == "pubmed": reference_list.append( ref["ArticleIdList"]["ArticleId"]["#text"] ) @@ -286,16 +291,13 @@ def parsing_details(article: Article) -> Article: reference_list.append(ref_id["#text"]) else: raise NotImplementedError - else: if type(PubmedData["ReferenceList"]["Reference"]) == dict: ref = PubmedData["ReferenceList"]["Reference"] if "ArticleIdList" in ref: if type(ref["ArticleIdList"]["ArticleId"]) == dict: - if ref["ArticleIdList"]["ArticleId"][ - "@IdType" - ] == "pubmed": + if ref["ArticleIdList"]["ArticleId"]["@IdType"] == "pubmed": reference_list.append( ref["ArticleIdList"]["ArticleId"]["#text"] ) @@ -310,9 +312,7 @@ def parsing_details(article: Article) -> Article: for ref in PubmedData["ReferenceList"]["Reference"]: if "ArticleIdList" in ref: if type(ref["ArticleIdList"]["ArticleId"]) == dict: - if ref["ArticleIdList"]["ArticleId"][ - "@IdType" - ] == "pubmed": + if ref["ArticleIdList"]["ArticleId"]["@IdType"] == "pubmed": reference_list.append( ref["ArticleIdList"]["ArticleId"]["#text"] ) @@ -335,10 +335,7 @@ def parsing_details(article: Article) -> Article: ) new_rcd = article.ReferenceCrawlerDeep - 1 for ref_pmid in reference_list: - persist.insert_new_pmid(pmid=ref_pmid, - reference_crawler_deep=new_rcd) - - + persist.insert_new_pmid(pmid=ref_pmid, reference_crawler_deep=new_rcd) if "AuthorList" in pubmed_article_data: author_list = [] @@ -356,9 +353,7 @@ def parsing_details(article: Article) -> Article: article.Authors = author_list else: logger.WARNING( - f"Article 
{article.PMID} has no AuthorList", - forecolore="white", - deep=5 + f"Article {article.PMID} has no AuthorList", forecolore="white", deep=5 ) return article diff --git a/triplea/the_private_backyard.py b/triplea/the_private_backyard.py index 349aab5..b398568 100644 --- a/triplea/the_private_backyard.py +++ b/triplea/the_private_backyard.py @@ -1,3 +1,6 @@ +# flake8: noqa + + import sys import time import networkx as nx @@ -22,7 +25,9 @@ # import triplea.service.graph.export as gexport from triplea.service.graph.extract import Emmanuel, check_upper_term, _t_emmanuel from triplea.service.click_logger import logger -from triplea.service.graph.extract.country_based_co_authorship import graph_extract_article_country +from triplea.service.graph.extract.country_based_co_authorship import ( + graph_extract_article_country, +) import triplea.service.repository.persist as persist from triplea.service.repository.pipeline_core import move_state_forward from triplea.service.repository.pipeline_flag import go_extract_topic, go_extract_triple @@ -30,7 +35,6 @@ from triplea.service.repository.state.custom.affiliation_mining import country_list - def check_map_topic(): f = open("one-graph.json") data = json.load(f) @@ -68,7 +72,6 @@ def check_map_topic(): ganaliz.info(G) - if __name__ == "__main__": pass @@ -77,8 +80,6 @@ def check_map_topic(): # proccess_bar = False # output_file = "topic.json" - - # l_nodes = [] # l_edges = [] # graphdict = gextract.graph_extractor( @@ -97,7 +98,7 @@ def check_map_topic(): # logger.DEBUG("Remove duplication in Nodes & Edges. ") # n = gextract.thefourtheye_2(l_nodes) # e = gextract.thefourtheye_2(l_edges) - + # n = list(n) # e = list(e) # graphdict = {"nodes": n, "edges": e} @@ -109,10 +110,64 @@ def check_map_topic(): # with open(output_file, "w") as outfile: # outfile.write(data1) - - import visualization.gdatarefresh as graphdatarefresh - file = "topic1.json" - with open(file, "r") as f: - graphdict = json.load(f) - graphdatarefresh.refresh_interactivegraph(graphdict) - graphdatarefresh.refresh_alchemy(graphdict) \ No newline at end of file + # import visualization.gdatarefresh as graphdatarefresh + # file = "topic1.json" + # with open(file, "r") as f: + # graphdict = json.load(f) + # graphdatarefresh.refresh_interactivegraph(graphdict) + # graphdatarefresh.refresh_alchemy(graphdict) + + l_pmid = persist.get_all_article_pmid_list() + total_article_in_current_state = len(l_pmid) + number_of_article_move_forward = 0 + logger.DEBUG(str(len(l_pmid)) + " Article(s) is in FlagAffiliationMining " + str(0)) + + bar = click.progressbar(length=len(l_pmid), show_pos=True, show_percent=True) + + refresh_point = 0 + for id in l_pmid: + start_time = time.time() + try: + number_of_article_move_forward = number_of_article_move_forward + 1 + + if refresh_point == 50: + refresh_point = 0 + persist.refresh() + print() + logger.INFO( + f"There are {str(total_article_in_current_state - number_of_article_move_forward)} article(s) left ", + forecolore="yellow", + ) + else: + refresh_point = refresh_point + 1 + + a = persist.get_article_by_pmid(id) + try: + updated_article = Article(**a.copy()) + except Exception: + print() + print(logger.ERROR(f"Error in parsing article. 
PMID = {id}")) + raise Exception("Article Not Parsed.") + + if updated_article.Authors is not None: + for a in updated_article.Authors: + if a.Affiliations is not None: + for aff in a.Affiliations: + aff.Structural = None + + persist.update_article_by_pmid(updated_article, updated_article.PMID) + + # logger.DEBUG('Article ' + updated_article.PMID + ' with state ' + str(current_state) + ' forward to ' + str(current_state + 1)) + bar.label = "Article " + updated_article.PMID + " affiliation mining." + bar.update(1) + # # for re run + # if current_state == 2 : current_state = 1 + + except Exception: + persist.refresh() + exc_type, exc_value, exc_tb = sys.exc_info() + print() + print(exc_tb.tb_lineno) + logger.ERROR(f"Error {exc_type}") + logger.ERROR(f"Error {exc_value}") + persist.refresh() diff --git a/triplea/the_private_backyard1.py b/triplea/the_private_backyard1.py index 6762758..c152fb4 100644 --- a/triplea/the_private_backyard1.py +++ b/triplea/the_private_backyard1.py @@ -1,3 +1,6 @@ +# flake8: noqa +# noqa: F401 + import click import time import sys @@ -26,22 +29,19 @@ db = client["articledata"] col_article = db["articledata"] - myquery = {"$or":[ - {"Topics": re.compile('.*biobank.*', re.IGNORECASE) }, - {"Topics": re.compile('.*biobank.*', re.IGNORECASE) }, - {"Topics": re.compile('.*bio-bank.*', re.IGNORECASE) }, - + myquery = { + "$or": [ + {"Topics": re.compile(".*biobank.*", re.IGNORECASE)}, + {"Topics": re.compile(".*biobank.*", re.IGNORECASE)}, + {"Topics": re.compile(".*bio-bank.*", re.IGNORECASE)}, ] - } + } cursor = col_article.find(myquery, projection={"PMID": "$PMID", "_id": 0}) l_pmid = [] for a in list(cursor): - l_pmid.append(a['PMID']) + l_pmid.append(a["PMID"]) logger.DEBUG(f"{str(len(l_pmid))} Article(s) Selected.") - - - total_article_in_current_state = len(l_pmid) number_of_article_move_forward = 0 @@ -49,7 +49,11 @@ nodes = [] edges = [] csv = "" - csv = csv + """key,title,authors,issn,volume,issue,pages,year,publisher,url,abstract,notes,doi,keywords""" + "\n" + csv = ( + csv + + """key,title,authors,issn,volume,issue,pages,year,publisher,url,abstract,notes,doi,keywords""" + + "\n" + ) n = 0 for id in l_pmid: try: @@ -71,36 +75,39 @@ print() print(logger.ERROR(f"Error in parsing article. 
PMID = {id}")) raise Exception("Article Not Parsed.") - #------------------Select ---------------- - if updated_article.Title.__contains__("biobank") or updated_article.Title.__contains__("Biobank"): - n=n+1 + # ------------------Select ---------------- + if updated_article.Title.__contains__( + "biobank" + ) or updated_article.Title.__contains__("Biobank"): + n = n + 1 if updated_article.Title.__contains__(","): - title = updated_article.Title.replace('"', ' ') - title = f'"{title}"' + title = updated_article.Title.replace('"', " ") + title = f'"{title}"' else: title = updated_article.Title - + authors = "" issn = "" volume = "" issue = "" pages = "" try: - year = updated_article.OreginalArticle['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['Article']['ArticleDate']['Year'] + year = updated_article.OreginalArticle["PubmedArticleSet"][ + "PubmedArticle" + ]["MedlineCitation"]["Article"]["ArticleDate"]["Year"] except: year = "" publisher = "" - url= f"https://pubmed.ncbi.nlm.nih.gov/{updated_article.PMID}/" + url = f"https://pubmed.ncbi.nlm.nih.gov/{updated_article.PMID}/" if updated_article.Abstract.__contains__(","): - abstract = updated_article.Abstract.replace('"', ' ') - abstract = f'"{abstract}"' + abstract = updated_article.Abstract.replace('"', " ") + abstract = f'"{abstract}"' else: abstract = updated_article.Abstract notes = "" doi = "" keywords = "" - for au in updated_article.Authors: authors = authors + au.FullName + "," @@ -109,26 +116,28 @@ for k in updated_article.Keywords: keywords = keywords + k.Text + ";" - + if keywords != "": if keywords.__contains__(","): keywords = f'"{keywords[:-1]}"' + csv = ( + csv + + f"""{n},{title},{authors},{issn},{volume},{issue},{pages},{year},{publisher},{url},{abstract},{notes},{doi},{keywords}""" + + "\n" + ) - csv = csv + f"""{n},{title},{authors},{issn},{volume},{issue},{pages},{year},{publisher},{url},{abstract},{notes},{doi},{keywords}""" + "\n" - - - #------------------Select ---------------- + # ------------------Select ---------------- except Exception: - exc_type, exc_value, exc_tb = sys.exc_info() - print() - print(exc_tb.tb_lineno) - logger.ERROR(f"Error {exc_type}") - logger.ERROR(f"Error {exc_value}") - traceback.print_tb(exc_tb) + exc_type, exc_value, exc_tb = sys.exc_info() + print() + print(exc_tb.tb_lineno) + logger.ERROR(f"Error {exc_type}") + logger.ERROR(f"Error {exc_value}") + traceback.print_tb(exc_tb) - print(os.path.join('/path/to/Documents',"completeName")) + print(os.path.join("/path/to/Documents", "completeName")) with open("rayyan.csv", "w", encoding="utf-8") as file1: file1.write(csv) - logger.INFO("Export Complete.") \ No newline at end of file + logger.INFO("Export Complete.") diff --git a/triplea/the_private_backyard2.py b/triplea/the_private_backyard2.py index bc70252..09fe1d8 100644 --- a/triplea/the_private_backyard2.py +++ b/triplea/the_private_backyard2.py @@ -1,3 +1,5 @@ +# flake8: noqa +# noqa: F401 import click import time @@ -6,13 +8,16 @@ import re import networkx as nx from pymongo import MongoClient -from triplea.config.settings import SETTINGS,ROOT +from triplea.client.affiliation_parser import parse_affiliation +from triplea.config.settings import SETTINGS, ROOT from triplea.service.click_logger import logger from triplea.schemas.article import Article from triplea.schemas.node import Node from triplea.service.graph.analysis.info import info from triplea.service.repository.export.llm import export_pretrain_llm_in_dir -from triplea.service.repository.export.triplea_format import 
export_triplea_csvs_in_relational_mode_save_file +from triplea.service.repository.export.triplea_format import ( + export_triplea_csvs_in_relational_mode_save_file, +) import triplea.service.repository.persist as persist import triplea.service.graph.export.export as gexport import triplea.service.graph.analysis.ganalysis as ganaliz @@ -20,25 +25,45 @@ import os from triplea.service.repository.pipeline_core import move_state_forward -from triplea.service.repository.pipeline_flag import go_affiliation_mining, go_extract_topic -from triplea.service.repository.state.custom.affiliation_mining import get_affiliation_structured +from triplea.service.repository.pipeline_flag import ( + go_affiliation_mining, + go_extract_topic, +) +from triplea.service.repository.state.custom.affiliation_mining import ( + get_affiliation_structured, +) if __name__ == "__main__": pass - # export_triplea_csvs_in_relational_mode_save_file('export.csv', - # proccess_bar=True,limit_sample=0) + export_triplea_csvs_in_relational_mode_save_file( + "export.csv", proccess_bar=True, limit_sample=0 + ) + # move_state_forward(2) # go_affiliation_mining() # persist.change_flag_extract_topic(1,0) # go_extract_topic(proccess_bar=True) + # aff_text = "Institute for Molecular Medicine Finland (FIMM), Helsinki Institute of Life Science (HiLIFE), University of Helsinki, Helsinki, Finland. aarno.palotie@helsinki.fi" + # aff_text = "Department of Neurology and Institute of Neurology, Huashan Hospital, State Key Laboratory of Medical Neurobiology and MOE Frontiers Center for Brain Science, Shanghai Medical College, Fudan University, National Center for Neurological Disorders, Shanghai, China. jintai_yu@fudan.edu.cn" + # aff_text = "Department of Ophthalmology, University of Washington, Seattle, Washington, USA" + # print(get_affiliation_structured(aff_text)) + + # from triplea.service.repository.state.custom.affiliation_mining import _is_country + # print(_is_country("Finland. aarno.palotie@helsinki.fi")) + # print(_is_country("Finland")) + + # print(parse_affiliation(aff_text)) + + # go_affiliation_mining(method='Titipata') + + # import triplea.service.repository.state as state_manager + # a = persist.get_article_by_pmid('31679581') + # updated_article = Article(**a.copy()) + # state_manager.affiliation_mining_titipata(updated_article) - aff_text = "Institute for Molecular Medicine Finland (FIMM), Helsinki Institute of Life Science (HiLIFE), University of Helsinki, Helsinki, Finland. aarno.palotie@helsinki.fi" - aff_text = "Department of Neurology and Institute of Neurology, Huashan Hospital, State Key Laboratory of Medical Neurobiology and MOE Frontiers Center for Brain Science, Shanghai Medical College, Fudan University, National Center for Neurological Disorders, Shanghai, China. jintai_yu@fudan.edu.cn" - aff_text = "Department of Ophthalmology, University of Washington, Seattle, Washington, USA" - print(get_affiliation_structured(aff_text)) - - from triplea.service.repository.state.custom.affiliation_mining import _is_country - print(_is_country("Finland. 
aarno.palotie@helsinki.fi")) - print(_is_country("Finland")) + # import triplea.service.repository.state as state_manager + # a = persist.get_article_by_pmid('34358588') + # updated_article = Article(**a.copy()) + # state_manager.parsing_details(updated_article) diff --git a/triplea/the_private_backyard3.py b/triplea/the_private_backyard3.py index fbafca4..990d4c9 100644 --- a/triplea/the_private_backyard3.py +++ b/triplea/the_private_backyard3.py @@ -1,6 +1,7 @@ +# flake8: noqa +# noqa: F401 - - +import array from pymongo import MongoClient from tests.fixtures.graph_52 import graph52_instance from triplea.cli import export_graph @@ -10,7 +11,10 @@ from triplea.service.repository.import_file.triplea import import_triplea_json -from triplea.service.graph.analysis.ganalysis import get_avg_shortest_path_length_per_node, get_clustering_coefficient_per_node +from triplea.service.graph.analysis.ganalysis import ( + get_avg_shortest_path_length_per_node, + get_clustering_coefficient_per_node, +) import networkx as nx from triplea.utils.general import safe_csv @@ -18,20 +22,3 @@ if __name__ == "__main__": pass - # text='Schizophrenia, "Just the Facts": what we know in 2008 part 1: overview ' - # print(safe_csv(text)) - - _connection_url = SETTINGS.AAA_MONGODB_CONNECTION_URL - client = MongoClient(_connection_url) - db = client[SETTINGS.AAA_MONGODB_DB_NAME] - col_article = db["articledata"] - col_nodes = db["nodes"] - col_edges = db["edges"] - col_triple = db["triple"] - myquery = {"FlagExtractTopic": 0} - sett = {"$set": {"Topics": []}} - r = col_article.update_many(myquery, sett) - - - - diff --git a/triplea/the_private_backyard_mongodb.py b/triplea/the_private_backyard_mongodb.py new file mode 100644 index 0000000..fc25e87 --- /dev/null +++ b/triplea/the_private_backyard_mongodb.py @@ -0,0 +1,89 @@ +# flake8: noqa + +from pymongo import MongoClient +from triplea.config.settings import SETTINGS + + +def get_flag(): + _connection_url = SETTINGS.AAA_MONGODB_CONNECTION_URL + client = MongoClient(_connection_url) + db = client[SETTINGS.AAA_MONGODB_DB_NAME] + col_article = db["articledata"] + pipeline = [ + {"$group": {"_id": {"State": "$State"}, "COUNT(_id)": {"$sum": 1}}}, + {"$project": {"State": "$_id.State", "n": "$COUNT(_id)", "_id": 0}}, + ] + pipeline = [ + { + "$group": { + "_id": {"FlagAffiliationMining": "$FlagAffiliationMining"}, + "COUNT(_id)": {"$sum": 1}, + } + }, + { + "$project": { + "FlagAffiliationMining": "$_id.FlagAffiliationMining", + "n": "$COUNT(_id)", + "_id": 0, + } + }, + ] + print(list(col_article.aggregate(pipeline))) + + +def change(): + _connection_url = SETTINGS.AAA_MONGODB_CONNECTION_URL + client = MongoClient(_connection_url) + db = client[SETTINGS.AAA_MONGODB_DB_NAME] + col_article = db["articledata"] + myquery = {"FlagAffiliationMining": 1} + sett = {"$set": {"FlagAffiliationMining": 0}} + r = col_article.update_many(myquery, sett) + + +def change_CiteCrawlerDeep(): + _connection_url = SETTINGS.AAA_MONGODB_CONNECTION_URL + client = MongoClient(_connection_url) + db = client[SETTINGS.AAA_MONGODB_DB_NAME] + col_article = db["articledata"] + # col_nodes = db["nodes"] + # col_edges = db["edges"] + # col_triple = db["triple"] + myquery = {"CiteCrawlerDeep": 0} + sett = {"$set": {"CiteCrawlerDeep": 1}} + r = col_article.update_many(myquery, sett) + + +def change_State(): + _connection_url = SETTINGS.AAA_MONGODB_CONNECTION_URL + client = MongoClient(_connection_url) + db = client[SETTINGS.AAA_MONGODB_DB_NAME] + col_article = db["articledata"] + myquery = {"State": 3} + 
sett = {"$set": {"State": 2}} + r = col_article.update_many(myquery, sett) + + +def change_complex(): + _connection_url = SETTINGS.AAA_MONGODB_CONNECTION_URL + client = MongoClient(_connection_url) + db = client[SETTINGS.AAA_MONGODB_DB_NAME] + col_article = db["articledata"] + # myquery = {"FlagAffiliationMining": 1} + # sett = {"$set": {"FlagAffiliationMining": 0}} + # r = col_article.update_many(myquery, sett) + + myquery = {"FlagAffiliationMining": 0, "Authors.Affiliations": {"$ne": "null"}} + sett = {"$unset": {"Authors.$[author].Affiliations.$[affil].Structural": ""}} + filter = [ + {"author.Affiliations": {"$exists": True}}, + {"affil.Structural": {"$exists": True}}, + ] + + r = col_article.update_many(myquery, sett, array_filters=filter) + + +if __name__ == "__main__": + pass + # change() + change_State() diff --git a/triplea/utils/__init__.py b/triplea/utils/__init__.py index dae205c..80888d7 100644 --- a/triplea/utils/__init__.py +++ b/triplea/utils/__init__.py @@ -2,4 +2,4 @@ __all__ = [ "safe_csv", -] \ No newline at end of file +] diff --git a/triplea/utils/general.py b/triplea/utils/general.py index 2e124bd..5c9e8bf 100644 --- a/triplea/utils/general.py +++ b/triplea/utils/general.py @@ -1,15 +1,11 @@ - - - - -def safe_csv(text:str) -> str: +def safe_csv(text: str) -> str: if text is None: return "" if text.__contains__(","): if text.__contains__('"'): text = text.replace('"', "'") - text = f'"{text[:-1]}"' + text = f'"{text[:-1]}"' else: - text = f'"{text[:-1]}"' + text = f'"{text[:-1]}"' - return text \ No newline at end of file + return text