From 55ec00dafb2ec938a96f525b7ded0fa223ccea60 Mon Sep 17 00:00:00 2001 From: Aki Ariga Date: Fri, 8 Sep 2023 22:04:42 -0700 Subject: [PATCH] chore: add warning about java options Follow up for #356 --- docs/faq.rst | 6 ++++++ tabula/io.py | 8 ++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/docs/faq.rst b/docs/faq.rst index fb93e14..c0f3889 100644 --- a/docs/faq.rst +++ b/docs/faq.rst @@ -184,6 +184,12 @@ No. Sometimes, you might see a message like `` Jul 17, 2019 10:21:25 AM org.apache.pdfbox.pdmodel.font.PDType1Font WARNING: Using fallback font NimbusSanL-Regu for Univers. Nothing was parsed from this one.`` This error message came from Apache PDFBox which is used under tabula-java, and this is caused by the PDF itself. Neither tabula-py nor tabula-java can't handle the warning itself, except for the silent option that suppresses the warning. +``java_options`` is ignored once ``read_pdf`` or similar function is called. +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Since jpype doesn't support changing JVM options after the JVM is started, ``java_options`` is ignored once ``read_pdf`` or similar function is called. If you want to change JVM options, you need to restart the Python process. +See also: https://jpype.readthedocs.io/en/latest/api.html#jpype.shutdownJVM + + I can't figure out accurate extraction with tabula-py. Are there any similar Python libraries? 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/tabula/io.py b/tabula/io.py index 33636fa..5f29fd4 100644 --- a/tabula/io.py +++ b/tabula/io.py @@ -66,6 +66,8 @@ def _run( global _tabula_vm if not _tabula_vm: _tabula_vm = TabulaVm(java_options, options.silent) + elif java_options: + logger.warning("java_options is ignored until rebooting the Python process.") return _tabula_vm.call_tabula_java(options, path) @@ -151,7 +153,7 @@ def read_pdf( encoding (str, optional): Encoding type for pandas. Default: ``utf-8`` java_options (list, optional): - Set java options. + Set java options. This option will be ignored once JVM is launched. Example: ``["-Xmx256m"]`` @@ -503,6 +505,7 @@ def read_pdf_with_template( Encoding type for pandas. Default is 'utf-8' java_options (list, optional): Set java options like ``["-Xmx256m"]``. + This option will be ignored once JVM is launched. user_agent (str, optional): Set a custom user-agent when download a pdf from a url. Otherwise it uses the default ``urllib.request`` user-agent. @@ -732,7 +735,7 @@ def convert_into( Output format of this function (``csv``, ``json`` or ``tsv``). Default: ``csv`` java_options (list, optional): - Set java options + Set java options. This option will be ignored once JVM is launched. Example: ``"-Xmx256m"``. @@ -866,6 +869,7 @@ def convert_into_by_batch( Output format of this function (csv, json or tsv) java_options (list, optional): Set java options like `-Xmx256m`. + This option will be ignored once JVM is launched. pages (str, int, `iterable` of `int`, optional): An optional values specifying pages to extract from. It allows `str`,`int`, `iterable` of :`int`. Default: `1`