
Commit

sizer data imports (#359)
* added doc to website
Fixes #357

* testing that cpc load still works

* updated the examples

* updated example
Gorkowski committed Nov 13, 2023
1 parent 21d6d2b commit 82b77e7
Showing 7 changed files with 957 additions and 96 deletions.
1 change: 1 addition & 0 deletions docs/_toc.yml
@@ -13,6 +13,7 @@ parts:
- file: examples/distribution_evolution
- file: examples/distribution_ambient
- file: examples/ionparticle_coagulation
- file: examples/loading_data_part1
- caption: Documentation
numbered: false
chapters:
34 changes: 25 additions & 9 deletions docs/examples/loading_data_part1.ipynb
@@ -4,11 +4,13 @@
"cell_type": "markdown",
"metadata": {},
"source": [
" # Loading Data from a File\n",
" # Loading Data from a File Part 1\n",
"\n",
" This example shows how to load data from a file into a Stream object. These\n",
" are usufull for doing some automated analysis, but you can just pull\n",
" data from a file and do whatever you want with it."
" This example shows how to load data from a file and automate the cleaning, \n",
" formatting, and processing of the data.\n",
"\n",
" If you have a lot of data and repetitive tasks, you can use the scripts at\n",
" the end of this example to clean up you import process."
]
},
{
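The three steps the notebook automates (load, clean, format) can be pictured in plain Python. A minimal sketch; the file name and column layout below are made up for illustration and are not part of the example's data:

import numpy as np

# 1. Load: read the raw lines from a (hypothetical) CSV file.
with open("example_data.csv", encoding="utf-8") as f:
    lines = f.read().splitlines()

# 2. Clean: keep only rows with the expected shape (the 'data_checks' idea).
rows = [line.split(",") for line in lines[1:] if line.count(",") == 4]

# 3. Format: split out the time column and the data columns as arrays.
time = np.array([row[0] for row in rows])
data = np.array([[float(row[1]), float(row[2])] for row in rows])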
@@ -40,7 +42,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 2,
"metadata": {},
"outputs": [
{
@@ -308,6 +310,7 @@
"filename_regex: *.csv\n",
"MIN_SIZE_BYTES: 10\n",
"data_loading_function: general_1d_load\n",
"header_row: 0\n",
"data_checks: {'characters': [10, 100], 'char_counts': {',': 4}, 'skip_rows': 0, 'skip_end': 0}\n",
"data_column: [1, 2]\n",
"data_header: ['data 1', 'data 2']\n",
@@ -364,7 +367,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -389,6 +392,7 @@
" 'filename_regex': '*.csv',\n",
" 'MIN_SIZE_BYTES': 10,\n",
" 'data_loading_function':'general_1d_load',\n",
" 'header_row': 0,\n",
" 'data_checks': {'characters': [\n",
" 10,\n",
" 100],\n",
@@ -417,7 +421,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -441,7 +445,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 11,
"metadata": {},
"outputs": [
{
@@ -465,12 +469,24 @@
" linestyle=\"none\",\n",
" marker=\".\",)\n",
"plt.tick_params(rotation=-35)\n",
"ax.set_xlabel(\"Time (epoch)\")\n",
"ax.set_xlabel(\"Time (UTC)\")\n",
"ax.set_ylabel(\"Data\")\n",
"ax.legend()\n",
"plt.show()\n",
"fig.tight_layout()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Summary\n",
"\n",
"We covered how to load data from a file and automate the cleaning, formatting,\n",
"and processing of the data. We then showed how to generate a settings\n",
"dictionary and use that to load the data into a `Stream` object. This is\n",
"useful if you have a lot of data and repetitive tasks. Doing this method also loads and combines multiple files into one `Stream` object.\n"
]
}
],
"metadata": {
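The axis-label change above (epoch to UTC) matches the usual recipe of converting epoch seconds to timezone-aware datetimes before plotting. A minimal sketch with made-up values:

from datetime import datetime, timezone

import matplotlib.pyplot as plt
import numpy as np

epoch_time = np.array([1_699_880_000, 1_699_880_060, 1_699_880_120])
data = np.array([1.0, 2.0, 1.5])
utc_time = [datetime.fromtimestamp(t, tz=timezone.utc) for t in epoch_time.tolist()]

fig, ax = plt.subplots()
ax.plot(utc_time, data, linestyle="none", marker=".")  # datetimes plot directly
ax.set_xlabel("Time (UTC)")
ax.set_ylabel("Data")
fig.tight_layout()
plt.show()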
714 changes: 714 additions & 0 deletions docs/examples/loading_data_part2.ipynb

Large diffs are not rendered by default.

52 changes: 30 additions & 22 deletions particula/data/loader.py
@@ -351,6 +351,7 @@ def general_data_formatter(
time_column: Union[int, List[int]],
time_format: str,
delimiter: str = ',',
header_row: int = 0,
date_offset: str = None,
seconds_shift: int = 0,
timezone_identifier: str = 'UTC'
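The new header_row keyword defaults to 0, so existing callers are unchanged; files with banner lines before the column names can point at the real header row. A toy illustration of the idea (the file content is invented):

raw = [
    "Instrument: CPC-3776",        # banner line, not the header
    "time,data 1,data 2",          # actual header on row 1
    "2023-11-13 10:00:00,1.0,2.0",
]
header_row = 1
header = raw[header_row].split(",")
print(header)  # ['time', 'data 1', 'data 2']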
@@ -383,6 +384,14 @@
A tuple containing two np.array objects: the first contains the
epoch times, and the second contains the data.
"""

# find matching column-name strings in the header row and get their indices
if isinstance(data_column[0], str):
data_header = data[header_row].split(delimiter)
# Get data column indices
data_column = [data_header.index(x)
for x in data_column]

# Check the data format
data = data_format_checks(data, data_checks)
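
Stand-alone, the new name-matching block behaves like this: when data_column holds header names instead of integer indices, it resolves them against the chosen header row. A self-contained run with invented data:

delimiter = ","
header_row = 0
data = ["time,data 1,data 2", "10:00,1.0,2.0"]
data_column = ["data 1", "data 2"]

if isinstance(data_column[0], str):
    data_header = data[header_row].split(delimiter)
    data_column = [data_header.index(x) for x in data_column]

print(data_column)  # [1, 2]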

@@ -408,6 +417,7 @@ def sizer_data_formatter(
time_column: int,
time_format: str,
delimiter: str = ',',
header_row: int = 0,
date_offset: str = None,
seconds_shift: int = 0,
timezone_identifier: str = 'UTC'
@@ -443,25 +453,33 @@
"""

# Get Dp range and columns
data_header = data[data_sizer_reader["header_rows"]].split(delimiter)
data_header = data[header_row].split(delimiter)
# check if start and end keywords are in the header
if data_sizer_reader["Dp_start_keyword"] not in data_header:
# raise error with a snippet of the data header
raise ValueError(
f"Cannot find '{data_sizer_reader['Dp_start_keyword']}' in header"\
+ f" {data_header[:20]}..."
)
if data_sizer_reader["Dp_end_keyword"] not in data_header:
# raise error with a snippet of the data header
raise ValueError(
f"Cannot find '{data_sizer_reader['Dp_end_keyword']}' in header"\
+ f" {data_header[:20]}..."
)
dp_range = [
data_header.index(data_sizer_reader["Dp_start_keyword"]),
data_header.index(data_sizer_reader["Dp_end_keyword"])
]
dp_columns = list(range(dp_range[0]+1, dp_range[1]))
dp_header = [data_header[i] for i in dp_columns]
header = [data_header[i] for i in dp_columns]
# change from np.array

# Get data columns
data_column = [
data_header.index(x) for x in data_sizer_reader["list_of_data_headers"]
]

# Format data
data = data_format_checks(data, data_checks)

# Get data arrays
epoch_time, data_smps_2d = sample_data(
epoch_time, data_2d = sample_data(
data,
time_column,
time_format,
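Stand-alone, the Dp-range lookup above selects the size-bin columns strictly between the two keywords. The keyword names and bin values here are invented for illustration:

data_header = ["Time", "Dp_start", "10.2", "15.7", "24.1", "Dp_end", "Total"]
dp_range = [data_header.index("Dp_start"), data_header.index("Dp_end")]
dp_columns = list(range(dp_range[0] + 1, dp_range[1]))
header = [data_header[i] for i in dp_columns]
print(header)  # ['10.2', '15.7', '24.1']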
@@ -471,16 +489,6 @@
seconds_shift=seconds_shift,
timezone_identifier=timezone_identifier
)
epoch_time, data_smps_1d = sample_data(
data,
time_column,
time_format,
data_column,
delimiter,
date_offset,
seconds_shift=seconds_shift,
timezone_identifier=timezone_identifier
)

if "convert_scale_from" in data_sizer_reader:
if data_sizer_reader["convert_scale_from"] == "dw":
@@ -493,13 +501,13 @@
" Either dw/dlogdp or dw must be specified."
)
for i in range(len(epoch_time)):
data_smps_2d[i, :] = convert.convert_sizer_dn(
diameter=np.array(dp_header).astype(float),
dn_dlogdp=data_smps_2d[i, :],
data_2d[i, :] = convert.convert_sizer_dn(
diameter=np.array(header).astype(float),
dn_dlogdp=data_2d[i, :],
inverse=inverse
)

return epoch_time, dp_header, data_smps_2d, data_smps_1d
return epoch_time, data_2d, header


def non_standard_date_location(
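For context, the dw/dlogdp-to-dw rescaling handled by convert.convert_sizer_dn is the standard sizer normalization, dN = (dN/dlogDp) * dlogDp. A toy sketch of that relation, not the library's implementation:

import numpy as np

diameter = np.array([10.0, 20.0, 40.0])      # nm, log-spaced size bins
dn_dlogdp = np.array([100.0, 200.0, 150.0])  # dN/dlogDp per bin
dlogdp = np.gradient(np.log10(diameter))     # log10 bin widths
dn = dn_dlogdp * dlogdp                      # number concentration per bin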