Skip to content

Commit

Permalink
Merge pull request #15 from mdavis-xyz/13-speedup
Browse files Browse the repository at this point in the history
Speed up the code
  • Loading branch information
prakaa committed Jun 5, 2024
2 parents bbcedc2 + 70831fe commit 8e4fb70
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 44 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/prakaa/mms-monthly-cli/master.svg)](https://results.pre-commit.ci/latest/github/prakaa/mms-monthly-cli/master)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)

A CLI utility to find and obtain data made available through AEMO's [MMS Monthly Data Archive](http://www.nemweb.com.au/Data_Archive/Wholesale_Electricity/MMSDM/).
A CLI utility to find and obtain data made available through AEMO's [MMS Monthly Data Archive](https://www.nemweb.com.au/Data_Archive/Wholesale_Electricity/MMSDM/).

> **Note**
>
Expand Down
2 changes: 1 addition & 1 deletion mms_monthly_cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
help=(
"A CLI utility to find and obtain data made available through "
+ "AEMO's MMS Monthly Data Archive: "
+ "http://www.nemweb.com.au/Data_Archive/Wholesale_Electricity/MMSDM/"
+ "https://www.nemweb.com.au/Data_Archive/Wholesale_Electricity/MMSDM/"
),
)

Expand Down
76 changes: 34 additions & 42 deletions mms_monthly_cli/mms_monthly.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@

import logging
import shutil
from functools import cache
from pathlib import Path
from re import match
from time import sleep
from typing import Dict, List, Union
from zipfile import BadZipFile, ZipFile

Expand All @@ -30,54 +32,38 @@

# Data

MMSDM_ARCHIVE_URL = (
    "https://www.nemweb.com.au/Data_Archive/Wholesale_Electricity/MMSDM/"
)
"""Wholesale electricity data archive base URL"""

# Module-level requests Session so that the TLS and HTTP connection is
# re-used across requests, for a speed improvement over opening a fresh
# connection per request.
# Browser-style default headers are set once here and sent with every
# request made through the session.
_session = requests.Session()
_session.headers.update(
    {
        # randomised, realistic User-Agent string
        "User-Agent": generate_user_agent(),
        "Accept": (
            "text/html,application/xhtml+xml,application/xml;"
            + "q=0.9,image/avif,image/webp,*/*;q=0.8"
        ),
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }
)

# Functions to handle requests and scraped soup

def _request_content(
    url: str, additional_header: Union[Dict, None] = None
) -> requests.Response:
    """Initiates a GET request through the shared module-level Session.

    The Session re-uses its TLS/HTTP connection across calls and already
    carries browser-style default headers; entries in ``additional_header``
    are merged on top of those for this request only.

    Args:
        url: URL for GET request.
        additional_header: Optional extra header fields to add to the GET
            request. Defaults to None (replaces the previous mutable
            ``= {}`` default — behaviour is unchanged, as ``requests``
            treats ``headers=None`` the same as an empty dict).
    Returns:
        requests Response object.
    """
    r = _session.get(url, headers=additional_header)
    return r


Expand All @@ -86,23 +72,28 @@ def _rerequest_to_obtain_soup(url: str, additional_header: Dict = {}) -> Beautif
Args:
url: URL for GET request.
useragent: User-Agent to use in header.
additional_header: Empty dictionary as default. Can be used to add
additional header information to GET request.
Returns:
BeautifulSoup object with parsed HTML.
"""
useragent = generate_user_agent()
r = _request_content(url, useragent, additional_header=additional_header)
r = _request_content(url, additional_header)

# retry configuration
initial_wait = 0.1
max_wait = 10
backoff = 2
wait = initial_wait

while (ok := r.status_code == requests.status_codes.codes["OK"]) < 1:
r = _request_content(url, useragent, additional_header=additional_header)
r = _request_content(url, additional_header)
if r.status_code == requests.status_codes.codes["OK"]:
ok += 1
else:
logging.info("Relaunching request")
useragent = generate_user_agent()
sleep(wait)
wait = min(wait * backoff, max_wait)

soup = BeautifulSoup(r.content, "html.parser")
return soup

Expand Down Expand Up @@ -197,7 +188,7 @@ def _get_filesize(url: str) -> int:
Returns:
File size in bytes
"""
h = requests.head(url, headers=_build_nemweb_get_header(generate_user_agent()))
h = _session.head(url)
total_length = int(h.headers.get("Content-Length", 0))
return total_length

Expand Down Expand Up @@ -250,6 +241,7 @@ def _validate_data_dir(year: int, month: int, data_dir: str) -> None:
# Main functions to find available data, or to obtain data


@cache
def get_years_and_months() -> Dict[int, List[int]]:
"""Years and months with data on NEMWeb MMSDM Historical Data Archive
Returns:
Expand All @@ -261,7 +253,6 @@ def _get_months(url: str) -> List[int]:
Args:
url: url for GET request.
header: useragent to pass to GET request.
Returns:
List of unique months (as integers).
"""
Expand Down Expand Up @@ -294,6 +285,7 @@ def _get_months(url: str) -> List[int]:
return yearmonths


@cache
def get_available_tables(year: int, month: int, data_dir: str) -> List[str]:
"""Tables that can be requested from MMSDM Historical Data Archive for a
particular month and year.
Expand All @@ -311,6 +303,7 @@ def get_available_tables(year: int, month: int, data_dir: str) -> List[str]:
return sorted(names)


@cache
def get_table_names_and_sizes(year: int, month: int, data_dir: str) -> Dict:
"""Returns table names and sizes from MMSDM Historical Data Archive page
Expand Down Expand Up @@ -366,11 +359,10 @@ def get_and_unzip_table_csv(
raise ValueError(f"Table not in available tables for {month}/{year}")
if not (cache_path := Path(cache)).exists():
cache_path.mkdir(parents=True)
header = _build_nemweb_get_header(generate_user_agent())
url = _construct_table_url(year, month, data_dir, table)
file_name = Path(url).name
file_path = cache / Path(file_name)
with requests.get(url, headers=header, stream=True) as resp:
with _session.get(url, stream=True) as resp:
total_length = int(resp.headers.get("Content-Length", 0))
resp.raise_for_status()
with tqdm.wrapattr(resp.raw, "read", desc=file_name, total=total_length) as raw:
Expand Down

0 comments on commit 8e4fb70

Please sign in to comment.