
Commit

sizer data imports (#359)
* added doc to website
Fixes #357

* testing that cpc load still works

* updated the examples

* updated example
Gorkowski committed Nov 13, 2023
1 parent 21d6d2b commit 82b77e7
Showing 7 changed files with 957 additions and 96 deletions.
1 change: 1 addition & 0 deletions docs/_toc.yml
@@ -13,6 +13,7 @@ parts:
- file: examples/distribution_evolution
- file: examples/distribution_ambient
- file: examples/ionparticle_coagulation
- file: examples/loading_data_part1
- caption: Documentation
numbered: false
chapters:
34 changes: 25 additions & 9 deletions docs/examples/loading_data_part1.ipynb
@@ -4,11 +4,13 @@
"cell_type": "markdown",
"metadata": {},
"source": [
" # Loading Data from a File\n",
" # Loading Data from a File Part 1\n",
"\n",
" This example shows how to load data from a file into a Stream object. These\n",
" are usufull for doing some automated analysis, but you can just pull\n",
" data from a file and do whatever you want with it."
" This example shows how to load data from a file and automate the cleaning, \n",
" formatting, and processing of the data.\n",
"\n",
" If you have a lot of data and repetitive tasks, you can use the scripts at\n",
" the end of this example to clean up you import process."
]
},
{
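The three steps the notebook automates (load, clean, format) can be pictured in plain Python. A minimal sketch; the file name and column layout below are made up for illustration and are not part of the example's data:

import numpy as np

# 1. Load: read the raw lines from a (hypothetical) CSV file.
with open("example_data.csv", encoding="utf-8") as f:
    lines = f.read().splitlines()

# 2. Clean: keep only rows with the expected shape (the 'data_checks' idea).
rows = [line.split(",") for line in lines[1:] if line.count(",") == 4]

# 3. Format: split out the time column and the data columns as arrays.
time = np.array([row[0] for row in rows])
data = np.array([[float(row[1]), float(row[2])] for row in rows])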
@@ -40,7 +42,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 2,
"metadata": {},
"outputs": [
{
@@ -308,6 +310,7 @@
"filename_regex: *.csv\n",
"MIN_SIZE_BYTES: 10\n",
"data_loading_function: general_1d_load\n",
"header_row: 0\n",
"data_checks: {'characters': [10, 100], 'char_counts': {',': 4}, 'skip_rows': 0, 'skip_end': 0}\n",
"data_column: [1, 2]\n",
"data_header: ['data 1', 'data 2']\n",
@@ -364,7 +367,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -389,6 +392,7 @@
" 'filename_regex': '*.csv',\n",
" 'MIN_SIZE_BYTES': 10,\n",
" 'data_loading_function':'general_1d_load',\n",
" 'header_row': 0,\n",
" 'data_checks': {'characters': [\n",
" 10,\n",
" 100],\n",
@@ -417,7 +421,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -441,7 +445,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 11,
"metadata": {},
"outputs": [
{
@@ -465,12 +469,24 @@
" linestyle=\"none\",\n",
" marker=\".\",)\n",
"plt.tick_params(rotation=-35)\n",
"ax.set_xlabel(\"Time (epoch)\")\n",
"ax.set_xlabel(\"Time (UTC)\")\n",
"ax.set_ylabel(\"Data\")\n",
"ax.legend()\n",
"plt.show()\n",
"fig.tight_layout()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Summary\n",
"\n",
"We covered how to load data from a file and automate the cleaning, formatting,\n",
"and processing of the data. We then showed how to generate a settings\n",
"dictionary and use that to load the data into a `Stream` object. This is\n",
"useful if you have a lot of data and repetitive tasks. Doing this method also loads and combines multiple files into one `Stream` object.\n"
]
}
],
"metadata": {
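The axis-label change above (epoch to UTC) matches the usual recipe of converting epoch seconds to timezone-aware datetimes before plotting. A minimal sketch with made-up values:

from datetime import datetime, timezone

import matplotlib.pyplot as plt
import numpy as np

epoch_time = np.array([1_699_880_000, 1_699_880_060, 1_699_880_120])
data = np.array([1.0, 2.0, 1.5])
utc_time = [datetime.fromtimestamp(t, tz=timezone.utc) for t in epoch_time.tolist()]

fig, ax = plt.subplots()
ax.plot(utc_time, data, linestyle="none", marker=".")  # datetimes plot directly
ax.set_xlabel("Time (UTC)")
ax.set_ylabel("Data")
fig.tight_layout()
plt.show()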
714 changes: 714 additions & 0 deletions docs/examples/loading_data_part2.ipynb

Large diffs are not rendered by default.

52 changes: 30 additions & 22 deletions particula/data/loader.py
@@ -351,6 +351,7 @@ def general_data_formatter(
time_column: Union[int, List[int]],
time_format: str,
delimiter: str = ',',
header_row: int = 0,
date_offset: str = None,
seconds_shift: int = 0,
timezone_identifier: str = 'UTC'
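The new header_row keyword defaults to 0, so existing callers are unchanged; files with banner lines before the column names can point at the real header row. A toy illustration of the idea (the file content is invented):

raw = [
    "Instrument: CPC-3776",        # banner line, not the header
    "time,data 1,data 2",          # actual header on row 1
    "2023-11-13 10:00:00,1.0,2.0",
]
header_row = 1
header = raw[header_row].split(",")
print(header)  # ['time', 'data 1', 'data 2']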
@@ -383,6 +384,14 @@
A tuple containing two np.array objects: the first contains the
epoch times, and the second contains the data.
"""

# find matching column-name strings in the header row and get their indices
if isinstance(data_column[0], str):
data_header = data[header_row].split(delimiter)
# Get data column indices
data_column = [data_header.index(x)
for x in data_column]

# Check the data format
data = data_format_checks(data, data_checks)
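
Stand-alone, the new name-matching block behaves like this: when data_column holds header names instead of integer indices, it resolves them against the chosen header row. A self-contained run with invented data:

delimiter = ","
header_row = 0
data = ["time,data 1,data 2", "10:00,1.0,2.0"]
data_column = ["data 1", "data 2"]

if isinstance(data_column[0], str):
    data_header = data[header_row].split(delimiter)
    data_column = [data_header.index(x) for x in data_column]

print(data_column)  # [1, 2]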

@@ -408,6 +417,7 @@ def sizer_data_formatter(
time_column: int,
time_format: str,
delimiter: str = ',',
header_row: int = 0,
date_offset: str = None,
seconds_shift: int = 0,
timezone_identifier: str = 'UTC'
@@ -443,25 +453,33 @@
"""

# Get Dp range and columns
data_header = data[data_sizer_reader["header_rows"]].split(delimiter)
data_header = data[header_row].split(delimiter)
# check if start and end keywords are in the header
if data_sizer_reader["Dp_start_keyword"] not in data_header:
# raise error with a snippet of the data header
raise ValueError(
f"Cannot find '{data_sizer_reader['Dp_start_keyword']}' in header"\
+ f" {data_header[:20]}..."
)
if data_sizer_reader["Dp_end_keyword"] not in data_header:
# raise error with a snippet of the data header
raise ValueError(
f"Cannot find '{data_sizer_reader['Dp_end_keyword']}' in header"\
+ f" {data_header[:20]}..."
)
dp_range = [
data_header.index(data_sizer_reader["Dp_start_keyword"]),
data_header.index(data_sizer_reader["Dp_end_keyword"])
]
dp_columns = list(range(dp_range[0]+1, dp_range[1]))
dp_header = [data_header[i] for i in dp_columns]
header = [data_header[i] for i in dp_columns]
# change from np.array

# Get data columns
data_column = [
data_header.index(x) for x in data_sizer_reader["list_of_data_headers"]
]

# Format data
data = data_format_checks(data, data_checks)

# Get data arrays
epoch_time, data_smps_2d = sample_data(
epoch_time, data_2d = sample_data(
data,
time_column,
time_format,
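Stand-alone, the Dp-range lookup above selects the size-bin columns strictly between the two keywords. The keyword names and bin values here are invented for illustration:

data_header = ["Time", "Dp_start", "10.2", "15.7", "24.1", "Dp_end", "Total"]
dp_range = [data_header.index("Dp_start"), data_header.index("Dp_end")]
dp_columns = list(range(dp_range[0] + 1, dp_range[1]))
header = [data_header[i] for i in dp_columns]
print(header)  # ['10.2', '15.7', '24.1']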
@@ -471,16 +489,6 @@
seconds_shift=seconds_shift,
timezone_identifier=timezone_identifier
)
epoch_time, data_smps_1d = sample_data(
data,
time_column,
time_format,
data_column,
delimiter,
date_offset,
seconds_shift=seconds_shift,
timezone_identifier=timezone_identifier
)

if "convert_scale_from" in data_sizer_reader:
if data_sizer_reader["convert_scale_from"] == "dw":
@@ -493,13 +501,13 @@
" Either dw/dlogdp or dw must be specified."
)
for i in range(len(epoch_time)):
data_smps_2d[i, :] = convert.convert_sizer_dn(
diameter=np.array(dp_header).astype(float),
dn_dlogdp=data_smps_2d[i, :],
data_2d[i, :] = convert.convert_sizer_dn(
diameter=np.array(header).astype(float),
dn_dlogdp=data_2d[i, :],
inverse=inverse
)

return epoch_time, dp_header, data_smps_2d, data_smps_1d
return epoch_time, data_2d, header


def non_standard_date_location(
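For context, the dw/dlogdp-to-dw rescaling handled by convert.convert_sizer_dn is the standard sizer normalization, dN = (dN/dlogDp) * dlogDp. A toy sketch of that relation, not the library's implementation:

import numpy as np

diameter = np.array([10.0, 20.0, 40.0])      # nm, log-spaced size bins
dn_dlogdp = np.array([100.0, 200.0, 150.0])  # dN/dlogDp per bin
dlogdp = np.gradient(np.log10(diameter))     # log10 bin widths
dn = dn_dlogdp * dlogdp                      # number concentration per bin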