Skip to content

Commit

Permalink
More fixes to webpages and LaTex (#2891)
Browse files Browse the repository at this point in the history
  • Loading branch information
JosselinSomervilleRoberts committed Aug 9, 2024
1 parent 3052092 commit 6313afb
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 27 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import re

from helm.common.optional_dependencies import handle_module_not_found_error, OptionalDependencyNotInstalled
from helm.common.hierarchical_logger import hlog

try:
from latex import build_pdf
Expand Down Expand Up @@ -220,25 +221,21 @@ def handle_latex_error(
# Error format: "LaTeX Error: Environment <env> undefined."
undefined_search = re.search(r"LaTeX Error: Environment (.*) undefined", str_e)
if undefined_search:
# If a package is missing and this is our first retry, then simply include TEX_INCLUDES
if num_try_remaining == MAX_NUM_TRIES:
fixed_code = fixed_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + "\n" + TEX_INCLUDES + "\n")
if num_try_remaining < MAX_NUM_TRIES or fixed_code == original_latex_code:
# Here we try to manually solve the missing environment.
# This is either executed on the second rety or the first if no changements
# were made in the first retry.
assert TEX_INCLUDES in fixed_code, "TEX_INCLUDES should be present in the code"
# TEX_INCLUDES is already present, so we add the missing package
# Since we cannot know the name of the package that contains the missing environment,
# we simply hope that they are named the same way.
env_undefined: str = undefined_search.group(1)

if f"\\usepackage{{{env_undefined}}}" in fixed_code:
# We already tried to include the missing package, but it probably
# does not exist, so we raise an error
raise RuntimeError(str(e)) from e

fixed_code = fixed_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + f"\n\\usepackage{{{env_undefined}}}\n")
# Here we try to manually solve the missing environment.
# This is either executed on the second retry or the first if no changes
# were made in the first retry.
assert TEX_INCLUDES in fixed_code, f"TEX_INCLUDES should be present in the code. code={fixed_code}"
# TEX_INCLUDES is already present, so we add the missing package
# Since we cannot know the name of the package that contains the missing environment,
# we simply hope that they are named the same way.
env_undefined: str = undefined_search.group(1)

if f"\\usepackage{{{env_undefined}}}" in fixed_code:
# We already tried to include the missing package, but it probably
# does not exist, so we raise an error
raise RuntimeError(str(e)) from e

fixed_code = fixed_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + f"\n\\usepackage{{{env_undefined}}}\n")

# Try again with the fixed code (if the fixed code is different from the original code)
if fixed_code != original_latex_code:
Expand Down Expand Up @@ -310,20 +307,21 @@ def latex_to_image(
documentclass_search = re.search(r"\\documentclass(\[.*?\])?\{.*?\}", original_latex_code)
documentstyle_search = re.search(r"\\documentstyle(\[.*?\])?\{.*?\}", original_latex_code)
if documentclass_search:
documentclass: str = documentclass_search.group(1)
original_latex_code = original_latex_code.replace(f"\\documentclass{{{documentclass}}}", TEX_BEGIN_FILE)
matching_string = documentclass_search.group()
original_latex_code = original_latex_code.replace(matching_string, TEX_BEGIN_FILE)
elif documentstyle_search:
documentstyle: str = documentstyle_search.group(1)
original_latex_code = original_latex_code.replace(f"\\documentstyle{{{documentstyle}}}", TEX_BEGIN_FILE)
matching_string = documentstyle_search.group()
original_latex_code = original_latex_code.replace(matching_string, TEX_BEGIN_FILE)
else:
# If there is no \documentclass, we add our own
original_latex_code = TEX_BEGIN_FILE + "\n\n" + original_latex_code

# 2.2. Add includes. In this ste we remove lal includes for the default ones.
# 2.2. Add includes. In this step we remove all existing includes and add the default ones.
original_latex_code = re.sub(r"\\usepackage(\[.*?\])?\{.*\}", "", original_latex_code)
original_latex_code = original_latex_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + "\n" + TEX_INCLUDES + "\n")

latex_code: str = original_latex_code
hlog(f"Compiling LaTeX code:\n{latex_code}")
try:
pdf_stream = latex_to_pdf(latex_code, assets_path=assets_path)
image = pdf_to_image(pdf_stream, crop=crop, resize_to=resize_to)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import Dict, List, Any, Optional

from helm.benchmark.annotation.image2struct.image_compiler_annotator import CompilationError
from helm.benchmark.scenarios.scenario import VALID_SPLIT
from helm.benchmark.scenarios.vision_language.image2struct.image2struct_scenario import (
Image2StructureScenario,
Expand All @@ -14,6 +15,7 @@
from helm.benchmark.scenarios.vision_language.image2struct.webpage.utils import convert_html_to_text
from helm.common.general import ensure_directory_exists
from helm.common.optional_dependencies import handle_module_not_found_error
from helm.common.hierarchical_logger import hlog

try:
from html2text import HTML2Text
Expand Down Expand Up @@ -73,31 +75,48 @@ def serve_and_take_screenshot(
if not success:
# This runs on examples that are not expected to fail
server.stop()
hlog(f"Failed to start the Jekyll server: {repo_path} on port {port}. Will raise a ValueError.")
raise ValueError(f"Jekyll server failed to start: {repo_path}")

# Take a screenshot of a random page
success = False
error: Optional[Exception] = None

for _ in range(max_tries):
MAX_TRIES_ALL_ERRORS = 3
MAX_TRIES_CONNECTION_REFUSED = 5
MAX_TRIES = max(MAX_TRIES_ALL_ERRORS, MAX_TRIES_CONNECTION_REFUSED)
for compilation_attempt in range(MAX_TRIES):
try:
infos: Dict[str, Any] = save_random_screenshot(destination_path, port=port, options=screenshot_options)
success = True
break
except Exception as e:
error = e

if "net::ERR_CONNECTION_REFUSED" in str(e):
if "net::ERR_CONNECTION_REFUSED" in str(e) and compilation_attempt < MAX_TRIES_CONNECTION_REFUSED:
hlog(
f"Failed to take a screenshot: ERR_CONNECTION_REFUSED [Attempt {compilation_attempt + 1}/"
f"{MAX_TRIES_CONNECTION_REFUSED}]. Error: {e}. Retrying..."
)
server.stop()
time.sleep(0.5)
server.start()
time.sleep(0.5)
elif compilation_attempt < MAX_TRIES_ALL_ERRORS:
hlog(
f"Failed to take a screenshot: Unknown [Attempt {compilation_attempt + 1}/{MAX_TRIES_ALL_ERRORS}]."
f" Error: {e}. Retrying..."
)
else:
# Do not retry
hlog(
f"Failed to take a screenshot: Unknown [Attempt {compilation_attempt + 1}/{MAX_TRIES_ALL_ERRORS}]."
f" Error: {e}. Raising CompilationError."
)
break

if not success:
raise ValueError(f"Failed to take a screenshot: {error}")
raise CompilationError(f"Failed to take a screenshot: {error}")

# Stop the server
server.stop()
Expand Down

0 comments on commit 6313afb

Please sign in to comment.