diff --git a/docs/_toc.yml b/docs/_toc.yml index 111d14f2e..14e3230a7 100644 --- a/docs/_toc.yml +++ b/docs/_toc.yml @@ -13,6 +13,7 @@ parts: - file: examples/distribution_evolution - file: examples/distribution_ambient - file: examples/ionparticle_coagulation + - file: examples/loading_data_part1 - caption: Documentation numbered: false chapters: diff --git a/docs/examples/loading_data_part1.ipynb b/docs/examples/loading_data_part1.ipynb index 3b571e987..7886615bd 100644 --- a/docs/examples/loading_data_part1.ipynb +++ b/docs/examples/loading_data_part1.ipynb @@ -4,11 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - " # Loading Data from a File\n", + " # Loading Data from a File Part 1\n", "\n", - " This example shows how to load data from a file into a Stream object. These\n", - " are usufull for doing some automated analysis, but you can just pull\n", - " data from a file and do whatever you want with it." + " This example shows how to load data from a file and automate the cleaning, \n", + " formatting, and processing of the data.\n", + "\n", + " If you have a lot of data and repetitive tasks, you can use the scripts at\n", + " the end of this example to clean up your import process." 
] }, { @@ -40,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -308,6 +310,7 @@ "filename_regex: *.csv\n", "MIN_SIZE_BYTES: 10\n", "data_loading_function: general_1d_load\n", + "header_row: 0\n", "data_checks: {'characters': [10, 100], 'char_counts': {',': 4}, 'skip_rows': 0, 'skip_end': 0}\n", "data_column: [1, 2]\n", "data_header: ['data 1', 'data 2']\n", @@ -364,7 +367,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -389,6 +392,7 @@ " 'filename_regex': '*.csv',\n", " 'MIN_SIZE_BYTES': 10,\n", " 'data_loading_function':'general_1d_load',\n", + " 'header_row': 0,\n", " 'data_checks': {'characters': [\n", " 10,\n", " 100],\n", @@ -417,7 +421,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -441,7 +445,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -465,12 +469,24 @@ " linestyle=\"none\",\n", " marker=\".\",)\n", "plt.tick_params(rotation=-35)\n", - "ax.set_xlabel(\"Time (epoch)\")\n", + "ax.set_xlabel(\"Time (UTC)\")\n", "ax.set_ylabel(\"Data\")\n", "ax.legend()\n", "plt.show()\n", "fig.tight_layout()" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "We covered how to load data from a file and automate the cleaning, formatting,\n", + "and processing of the data. We then showed how to generate a settings\n", + "dictionary and use that to load the data into a `Stream` object. This is\n", + "useful if you have a lot of data and repetitive tasks. 
This method also loads and combines multiple files into one `Stream` object.\n" + ] + } ], "metadata": { diff --git a/docs/examples/loading_data_part2.ipynb b/docs/examples/loading_data_part2.ipynb new file mode 100644 index 000000000..33ce0791e --- /dev/null +++ b/docs/examples/loading_data_part2.ipynb @@ -0,0 +1,714 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " # Loading Data from a File Part 2\n", + "\n", + " This example continues from the previous example, so if you haven't already\n", + " done so, please go through the previous example first.\n", + "\n", + " This example covers data in 2 dimensions, such as size distributions.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Working path\n", + "\n", + " Set the working path where the data is stored. For now we'll use the\n", + " provided example data in this current directory.\n", + "\n", + " But the path could be anywhere on your computer. For example, if you have a\n", + " folder called \"data\" in your home directory, you could set the path to:\n", + " `path = \"U:\\\\data\\\\processing\\\\Campgain2023_of_aswsome\\\\data\"`" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# all the imports, but we'll go through them one by one as we use them\n", + "import os\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from particula.data import loader, loader_interface, settings_generator\n", + "from particula.data.tests.example_data.get_example_data import get_data_folder" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Current path for this script:\n", + "\\particula\\docs\\examples\n", + "Path to data folder:\n", + "\\particula\\particula\\data\\tests\\example_data\n" + ] + } + ], + "source": [ + "# set the parent directory of the data 
folder, for now this is the same as the\n", + "# current working directory, but this can be a completely different path\n", + "#\n", + "# imports os to get the current working directory\n", + "import os\n", + "from particula.data.tests.example_data.get_example_data import get_data_folder\n", + "\n", + "current_path = os.getcwd()\n", + "print('Current path for this script:')\n", + "# print the path from particula/ onwards\n", + "print(current_path.split('GitHub')[1])\n", + "\n", + "path = get_data_folder()\n", + "print('Path to data folder:')\n", + "print(path.split('GitHub')[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " # Load the data\n", + "\n", + " With the working directory set, we can now load the data. For this we use\n", + " the `loader` module and call loader.data_raw_loader() with the file path as\n", + " argument." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Units,dW/dlogDp\n", + "Weight,Number\n", + "Sample #,Date,Start Time,Sample Temp (C),Sample Pressure (kPa),Relative Humidity (%),Mean Free Path (m),Gas Viscosity (Pa*s),Diameter Midpoint 
(nm),20.72,21.10,21.48,21.87,22.27,22.67,23.08,23.50,23.93,24.36,24.80,25.25,25.71,26.18,26.66,27.14,27.63,28.13,28.64,29.16,29.69,30.23,30.78,31.34,31.91,32.49,33.08,33.68,34.29,34.91,35.55,36.19,36.85,37.52,38.20,38.89,39.60,40.32,41.05,41.79,42.55,43.32,44.11,44.91,45.73,46.56,47.40,48.26,49.14,50.03,50.94,51.86,52.80,53.76,54.74,55.73,56.74,57.77,58.82,59.89,60.98,62.08,63.21,64.36,65.52,66.71,67.93,69.16,70.41,71.69,72.99,74.32,75.67,77.04,78.44,79.86,81.31,82.79,84.29,85.82,87.38,88.96,90.58,92.22,93.90,95.60,97.34,99.10,100.90,102.74,104.60,106.50,108.43,110.40,112.40,114.44,116.52,118.64,120.79,122.98,125.21,127.49,129.80,132.16,134.56,137.00,139.49,142.02,144.60,147.22,149.89,152.61,155.38,158.20,161.08,164.00,166.98,170.01,173.09,176.24,179.43,182.69,186.01,189.38,192.82,196.32,199.89,203.51,207.21,210.97,214.80,218.70,222.67,226.71,230.82,235.01,239.28,243.62,248.05,252.55,257.13,261.80,266.55,271.39,276.32,281.33,286.44,291.64,296.93,302.32,307.81,313.40,319.08,324.88,330.77,336.78,342.89,349.12,355.45,361.90,368.47,375.16,381.97,388.91,395.96,403.15,410.47,417.92,425.51,433.23,441.09,449.10,457.25,465.55,474.00,482.61,491.37,500.29,509.37,518.61,528.03,537.61,547.37,557.31,567.42,577.72,588.21,598.89,609.76,620.82,632.09,643.57,655.25,667.14,679.25,691.58,704.14,716.92,729.93,743.18,756.67,770.40,784.39,Scan Time (s),Retrace Time (s),Scan Resolution (Hz),Scans Per Sample,HV Polarity,Sheath Flow (L/min),Aerosol Flow (L/min),Bypass Flow (L/min),Low Voltage (V),High Voltage (V),Lower Size (nm),Upper Size (nm),Density (g/cm³),td + 0.5 (s),tf (s),D50 (nm),Median (nm),Mean (nm),Geo. Mean (nm),Mode (nm),Geo. Std. Dev.,Total Conc. 
(#/cm³),Neutralizer Status,Dilution Factor,Test Name,Test Description,Dataset Name,Dataset Description,Instrument Errors\n", + "1,07/07/2022,08:49:17,23.7,101.2,61.9,6.75690e-8,1.83579e-5,,6103.186,2832.655,4733.553,4765.944,5960.964,4475.806,4412.044,5853.069,4832.167,3781.343,3675.830,3271.549,3084.392,3668.269,4116.143,3310.157,3978.368,4151.566,2515.995,3755.837,2776.663,5032.745,3775.426,2818.553,2641.302,2636.806,3079.759,2606.094,2317.234,3192.346,2226.703,2484.878,3394.395,1762.834,3172.359,2919.533,2452.013,3403.780,2360.277,2543.386,2563.290,2649.769,1375.374,1364.046,1446.529,2068.167,1336.070,1542.077,1707.249,1482.481,2272.182,1754.409,2472.438,1191.563,2221.825,1635.293,2548.571,1991.926,2546.956,1790.114,2115.075,1138.769,1934.746,2163.955,1613.179,2132.750,1654.348,1698.154,2403.529,1222.983,1829.254,1197.162,1638.797,1248.565,2417.521,1130.421,1429.423,1694.923,1658.378,1443.393,1731.346,1277.799,1089.149,1072.630,1205.387,1693.146,1109.648,915.428,491.529,881.028,1218.297,755.658,714.301,686.247,790.943,398.805,1043.226,1298.495,1548.704,1070.899,846.596,938.241,232.947,926.941,837.452,794.492,254.455,392.637,353.144,872.576,693.986,1544.164,657.340,546.445,311.890,365.934,616.794,610.810,938.786,815.964,593.441,939.634,188.115,1077.429,1213.142,737.913,1876.626,735.779,996.521,1098.601,1166.494,962.551,1392.535,947.504,655.459,993.819,682.087,852.503,601.057,733.860,529.122,960.578,687.512,839.973,652.820,289.921,623.835,453.604,588.057,856.253,283.994,282.839,365.801,200.382,365.756,146.548,306.730,373.162,114.272,0.000,182.692,260.788,164.857,19.851,89.612,0.000,181.974,0.000,53.276,20.016,0.000,0.000,95.433,96.222,0.000,0.000,197.307,0.000,100.336,0.000,102.072,0.000,0.000,209.529,0.000,213.227,107.561,0.000,218.965,220.930,111.453,0.000,113.460,0.000,115.513,0.000,0.000,0.000,0.000,28.221,93.413,122.992,0.000,75,4,50,1,Negative,2.000,0.300,0.00,10.07,9863.01,20.5,791.5,1.0,1.81,10.79,1000.0,41.562,74.959,52.078,20.721,2.179,2.16900e+3,ON,1,TRA
CER-CAT,,2022 07 07 09_51,,Detector aerosol flow rate error;Incomplete Scan\n", + "2,07/07/2022,08:50:48,23.6,101.2,61.7,6.75401e-8,1.83531e-5,,5621.118,5867.747,6233.403,3453.156,4484.307,5468.148,4725.052,4689.983,3661.759,4356.725,4292.911,7728.414,5112.679,4746.084,3957.005,3472.977,3496.697,4674.202,4188.868,2868.559,3375.113,4306.112,5191.077,4732.512,4566.029,3514.167,5172.877,3825.270,5323.756,2327.737,3846.602,2347.097,3182.011,1876.273,2952.863,2831.255,2497.869,4158.061,3828.510,3199.720,2309.195,2462.550,3060.240,1086.744,1476.289,2069.774,1727.787,2710.631,2067.327,2619.082,2345.026,2362.235,1429.749,2557.408,2660.327,1209.933,1590.320,1696.569,2236.773,1499.046,1922.632,1650.213,3147.351,2201.919,1622.954,2198.739,1800.998,1429.621,1426.761,1923.931,1262.939,1745.284,1458.571,1523.548,1920.108,1382.558,2211.525,2571.277,1979.297,1562.697,1741.573,1307.680,967.481,838.919,1502.136,1301.401,1011.619,829.770,973.269,1100.004,1152.808,749.250,1187.900,806.256,111.008,297.062,809.059,1361.412,779.536,535.087,881.522,1307.518,800.804,1053.953,182.381,1042.830,673.021,646.171,825.612,963.187,748.743,540.954,769.157,788.222,825.566,236.537,865.009,289.185,803.098,398.510,446.847,439.645,1118.961,1003.003,924.180,745.149,430.134,415.522,805.970,790.348,998.975,1043.136,604.082,1004.545,1082.455,1312.781,1447.390,872.420,398.380,695.719,857.412,645.872,691.129,623.007,471.728,641.049,1023.693,394.611,475.599,446.076,657.686,313.003,136.395,248.550,579.894,336.126,485.938,298.810,0.000,227.571,104.550,157.583,289.697,0.229,0.000,0.000,217.592,67.816,24.067,0.000,0.000,0.000,0.000,0.000,97.009,0.000,0.000,0.000,0.000,0.000,0.000,205.900,0.000,104.761,0.000,0.000,0.000,108.509,0.000,110.461,0.000,0.000,0.000,114.471,115.506,116.540,0.000,118.660,0.000,0.000,0.000,0.000,75.377,75,4,50,1,Negative,2.000,0.300,0.00,10.07,9863.01,20.5,791.5,1.0,1.81,10.79,1000.0,39.458,69.080,49.198,25.255,2.101,2.39408e+3,ON,1,TRACER-CAT,,2022 07 07 09_51,,Detector aerosol flow rate 
error;Incomplete Scan\n", + "3,07/07/2022,08:52:19,23.7,101.2,61.5,6.75690e-8,1.83579e-5,,5165.139,4969.987,4312.386,6939.394,4680.764,3224.473,4999.149,3653.002,4241.532,3928.137,2718.607,3363.947,4863.410,5338.452,4659.515,3430.329,3997.386,4644.421,4943.511,3883.970,3212.310,4445.981,2349.435,3605.419,4366.557,4969.924,4880.573,3186.281,3089.412,2724.537,3195.740,4277.947,4864.436,4263.532,2100.807,1967.634,3283.337,3268.660,3001.917,2781.549,1879.354,1376.083,2051.524,2165.874,2012.210,2923.129,1575.515,1544.252,1610.635,1572.609,1299.370,1549.832,1145.100,2897.864,1839.992,2351.579,2102.027,1543.106,953.811,2073.610,2317.378,2087.617,1586.363,1897.860,2456.722,1647.781,1013.534,1734.023,1633.021,1841.697,2193.442,2714.856,1396.336,2264.046,1671.363,1538.012,1257.148,1423.316,1217.281,1745.437,1787.473,1284.774,1534.815,1274.852,1438.025,1199.602,964.066,862.098,685.995,679.146,879.775,806.703,979.672,894.103,1379.499,1112.031,744.999,580.777,1241.262,960.784,750.484,908.236,957.901,652.265,1200.515,429.487,347.453,552.393,617.871,652.163,709.227,788.963,1499.238,627.895,1315.208,976.800,555.360,440.680,1182.819,863.800,362.530,942.047,460.380,1222.507,678.820,1006.555,319.371,91.941,761.841,205.384,449.120,751.217,572.530,350.734,295.089,413.379,612.088,474.457,678.504,490.408,751.536,400.656,585.567,676.707,364.052,124.385,631.790,788.487,566.062,390.904,141.751,256.369,366.589,528.781,512.078,257.120,393.412,350.601,361.659,65.138,348.203,326.629,329.714,175.810,111.365,74.091,103.212,0.000,0.000,47.532,0.000,166.826,0.000,96.217,388.070,97.832,98.649,99.490,200.678,202.399,0.000,102.953,0.000,0.000,105.683,106.611,33.630,183.108,2.602,218.305,222.901,0.000,226.925,0.000,0.000,116.553,0.000,118.661,119.732,120.801,0.000,122.992,124.085,75,4,50,1,Negative,2.000,0.300,0.00,10.07,9863.01,20.5,791.5,1.0,1.81,10.79,1000.0,39.324,72.102,50.019,21.870,2.136,2.27861e+3,ON,1,TRACER-CAT,,2022 07 07 09_51,,Detector aerosol flow rate error;Incomplete Scan\n", + 
"4,07/07/2022,08:53:50,23.8,101.2,61.4,6.75979e-8,1.83627e-5,,5814.745,5937.421,5542.118,7127.484,5341.069,4793.690,4938.844,5721.541,4877.746,5900.250,5104.984,4914.366,4891.892,6655.579,4431.173,3389.961,4947.809,3115.245,4138.126,5421.474,4589.063,4007.156,2524.137,5009.064,4780.963,4959.096,3648.285,4148.676,4270.099,2229.465,3043.487,5618.376,3689.188,4700.549,2535.915,1754.223,2560.335,2853.385,2454.711,2515.907,3015.370,1502.864,2344.161,2761.448,2047.076,1542.531,2151.757,2365.884,2330.816,2585.566,1431.955,2391.335,2097.717,1891.014,2211.815,2071.479,2188.302,2475.058,1906.364,1781.793,2356.998,1527.723,2609.446,1644.771,1917.624,1843.984,2418.197,1385.516,1263.621,2155.939,2083.223,1765.167,957.777,2077.747,1667.811,1122.065,1579.113,1709.471,1604.406,686.151,390.075,1194.313,1657.144,1462.232,1870.846,1012.132,847.165,1248.528,1039.604,779.076,1375.101,1058.272,1013.378,1211.420,1641.490,979.146,835.539,763.524,951.720,1270.393,1308.492,1056.486,1715.924,657.112,1475.767,235.866,827.129,1266.089,1080.958,1246.249,1147.116,840.719,1560.246,1201.554,1743.366,1233.526,1166.422,1068.551,1047.492,787.018,759.836,491.419,714.111,460.361,681.068,767.815,654.715,501.038,357.016,575.937,613.281,851.029,583.739,475.691,431.584,616.144,744.932,409.334,984.682,371.750,613.130,757.474,637.077,441.004,609.132,380.961,595.419,565.033,566.955,332.402,450.524,139.761,430.419,443.058,558.628,158.467,271.708,346.807,57.637,148.050,226.825,353.827,77.661,0.000,0.000,74.100,0.000,250.296,117.433,93.156,187.816,0.000,95.443,0.000,0.000,293.505,0.000,99.496,100.342,0.000,102.078,102.959,0.000,0.000,0.000,106.622,322.709,0.000,328.474,0.000,67.473,44.378,0.000,0.000,115.519,0.000,0.000,118.668,0.000,0.000,0.000,0.000,0.000,75,4,50,1,Negative,2.000,0.300,0.00,10.07,9863.01,20.5,791.5,1.0,1.81,10.79,1000.0,37.995,68.796,48.896,21.870,2.107,2.51144e+3,ON,1,TRACER-CAT,,2022 07 07 09_51,,Detector aerosol flow rate error;Incomplete Scan\n", + 
"5,07/07/2022,08:55:21,24.0,101.1,61.4,6.77227e-8,1.83722e-5,,8034.425,6317.981,6972.600,4577.324,6488.519,4985.397,5484.518,7295.312,3449.590,4261.716,4259.456,6124.670,4418.824,5418.742,3311.293,3548.897,4940.747,6738.536,3377.823,3309.433,5322.339,4148.187,3387.285,3967.636,5064.382,4573.259,3896.245,4006.531,3769.030,4129.946,4678.454,3121.839,3888.625,2443.782,1947.617,2321.130,1845.465,2833.269,2745.881,3262.145,4055.876,2319.187,3397.282,2596.623,2935.256,1508.733,1555.232,3184.200,2683.631,2158.530,2303.663,2739.336,2714.276,2536.377,2051.076,2063.667,2074.972,2852.267,2366.702,2135.668,1500.801,2228.817,2220.527,1501.131,2354.567,2072.434,2547.917,2111.890,1474.809,1561.614,1334.889,1100.318,1077.335,1470.618,1377.825,1684.933,1093.441,1596.409,1456.255,1543.298,1116.499,984.258,1294.805,1586.816,723.664,1709.369,1060.965,1415.310,1611.158,1791.258,1098.238,1513.790,1335.019,1178.572,1538.772,477.803,1130.380,1596.999,652.664,1098.951,1384.104,772.285,788.185,1432.363,773.331,729.470,819.882,979.684,925.309,753.771,706.255,659.741,1026.707,818.647,1205.428,940.460,906.655,758.763,811.344,1123.245,520.356,1009.392,651.265,735.336,209.657,549.624,537.181,841.849,483.705,713.011,497.248,743.196,556.459,953.140,847.692,614.097,423.810,816.193,627.059,453.998,976.898,592.170,548.197,535.480,667.837,312.390,476.781,369.028,451.687,432.520,1001.512,312.053,498.408,198.771,399.968,363.778,403.848,381.782,223.839,227.667,212.819,101.097,164.909,359.326,285.450,0.000,44.177,0.000,158.441,220.559,81.404,49.687,95.468,0.000,194.095,391.452,98.679,0.000,0.000,0.000,0.000,102.990,103.881,104.800,0.000,106.644,0.000,108.554,0.000,110.496,0.000,112.492,113.494,0.000,115.548,0.000,0.000,0.000,239.531,241.683,0.000,0.000,248.252,75,4,50,1,Negative,2.000,0.300,0.00,10.07,9863.01,20.5,791.5,1.0,1.81,10.79,1000.0,39.214,69.960,48.959,20.721,2.123,2.56068e+3,ON,1,TRACER-CAT,,2022 07 07 09_51,,Detector aerosol flow rate error;Incomplete Scan\n" + ] + } + ], + "source": [ + 
"data_file = os.path.join(\n", + " path,\n", + " 'SMPS_data',\n", + " '2022-07-07_095151_SMPS.csv')\n", + "\n", + "# print the file path\n", + "# print(data_file)\n", + "\n", + "# load the data\n", + "raw_data = loader.data_raw_loader(data_file)\n", + "\n", + "# print the interesting bits\n", + "for row in raw_data[22:30]:\n", + " print(row)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " # Now to format the data\n", + "\n", + " This is a little more complicated than the 1d data, because we have to\n", + " pull out the size bins, and read them in as our headers. This is done by\n", + " specifying the start and end keywords for the size bins. In this case\n", + " the start keyword is \"Diameter Midpoint (nm)\" and the end keyword is \"Scan Time (s)\"." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch time:\n", + "[1.65718376e+09 1.65718385e+09 1.65718394e+09 1.65718403e+09\n", + " 1.65718412e+09]\n", + "Data shape:\n", + "(2854, 203)\n", + "Header:\n", + "['20.72', '21.10', '21.48', '21.87', '22.27', '22.67', '23.08', '23.50', '23.93', '24.36']\n" + ] + } + ], + "source": [ + "# For timeseries data this is done by the general_data_formatter function,\n", + "# 2d data uses a separate function, sizer_data_formatter\n", + "\n", + "epoch_time, data, header = loader.sizer_data_formatter(\n", + " data=raw_data,\n", + " data_checks={\n", + " \"characters\": [250],\n", + " \"skip_rows\": 25,\n", + " \"skip_end\": 0,\n", + " \"char_counts\": {\"/\": 2, \":\": 2}\n", + " },\n", + " data_sizer_reader={\n", + " 'Dp_start_keyword': 'Diameter Midpoint (nm)',\n", + " 'Dp_end_keyword': 'Scan Time (s)',\n", + " 'convert_scale_from': 'dw/dlogdp'\n", + " },\n", + " time_column=[1, 2],\n", + " time_format=\"%m/%d/%Y %H:%M:%S\",\n", + " delimiter=\",\",\n", + " header_row=24)\n", + "\n", + "# print the first bit of the data\n", + "print('Epoch time:')\n", + 
"print(epoch_time[:5])\n", + "print('Data shape:')\n", + "print(data.shape)\n", + "print('Header:')\n", + "print(header[:10])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Pause to Plot\n", + "\n", + " Now that we have the data and time, we can plot it to see what it looks\n", + " like." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# plot the data\n", + "fig, ax = plt.subplots()\n", + "ax.plot(epoch_time,\n", + " data[:, 50],\n", + " label=f'Bin {header[50]} nm',\n", + ")\n", + "ax.set_xlabel(\"Time (epoch)\")\n", + "ax.set_ylabel(\"Bin Concentration (#/cm³)\")\n", + "ax.legend()\n", + "plt.show()\n", + "fig.tight_layout()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dates in Plots\n", + "\n", + "If you want dates on the x-axis, you need to convert the dates to\n", + "matplotlib dates, or use np.datetime64. This is done in the `convert.datetime64_from_epoch_array` function.\n", + "\n", + "Then it is usually best to rotate the x-axis labels so they don't overlap.\n", + "This is done with the `plt.xticks(rotation=45)` function." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from particula.util.convert import datetime64_from_epoch_array\n", + "\n", + "# convert the epoch time to datetime64\n", + "time_in_datetime64 = datetime64_from_epoch_array(epoch_time)\n", + "\n", + "# plot the data\n", + "fig, ax = plt.subplots()\n", + "ax.plot(time_in_datetime64,\n", + " data[:, 50],\n", + " label=f'Bin {header[50]} nm',\n", + " )\n", + "plt.xticks(rotation=45)\n", + "ax.set_xlabel(\"Time (epoch)\")\n", + "ax.set_ylabel(\"Bin Concentration (#/cm³)\")\n", + "ax.legend()\n", + "plt.show()\n", + "fig.tight_layout()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Contour plot of data\n", + "\n", + " We can also plot the data as a contour plot, which is useful for seeing\n", + " how the data changes over time." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# setting limits helps to see the data better, you can also use np.log10()\n", + "# to plot the concentration in log space\n", + "concentration = data\n", + "concentration = np.where(concentration < 1e-5, 1e-5, concentration)\n", + "concentration = np.where(concentration > 10**2.5, 10**2.5, concentration)\n", + "# concentration = np.log10(concentration)\n", + "\n", + "fig, ax = plt.subplots(1, 1)\n", + "plt.contourf(\n", + " epoch_time,\n", + " np.array(header).astype(float),\n", + " concentration.T,\n", + " cmap=plt.cm.PuBu_r, levels=50)\n", + "plt.yscale('log')\n", + "ax.set_xlabel('Epoch Time')\n", + "ax.set_ylabel('Diameter (nm)')\n", + "plt.colorbar(label='Concentration dN/dlogDp [#/cm3]', ax=ax)\n", + "plt.show()\n", + "fig.tight_layout()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Settings Generator for 1d and 2d data\n", + "\n", + " Just like with the 1d data, we can use the settings generator to generate\n", + " the settings dictionary for importing the data. This is done by calling the\n", + " `settings_generator.for_general_sizer_1d_2d_load()` function.\n", + "\n", + " This function has a lot of arguments, but remember, if you just want the\n", + " default settings, you don't need to pass any arguments. The defaults are\n", + " set to the example data provided." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Settings 1d data dictionary:\n", + "relative_data_folder: SMPS_data\n", + "filename_regex: *.csv\n", + "MIN_SIZE_BYTES: 10\n", + "data_loading_function: general_1d_load\n", + "header_row: 24\n", + "data_checks: {'characters': [250], 'skip_rows': 25, 'skip_end': 0, 'char_counts': {'/': 2, ':': 2}}\n", + "data_column: ['Lower Size (nm)', 'Upper Size (nm)', 'Sample Temp (C)', 'Sample Pressure (kPa)', 'Relative Humidity (%)', 'Median (nm)', 'Mean (nm)', 'Geo. Mean (nm)', 'Mode (nm)', 'Geo. Std. Dev.', 'Total Conc. (#/cm³)']\n", + "data_header: ['Lower_Size_(nm)', 'Upper_Size_(nm)', 'Sample_Temp_(C)', 'Sample_Pressure_(kPa)', 'Relative_Humidity_(%)', 'Median_(nm)', 'Mean_(nm)', 'Geo_Mean_(nm)', 'Mode_(nm)', 'Geo_Std_Dev.', 'Total_Conc_(#/cc)']\n", + "time_column: [1, 2]\n", + "time_format: %m/%d/%Y %H:%M:%S\n", + "delimiter: ,\n", + "time_shift_seconds: 0\n", + "timezone_identifier: UTC\n", + "\n", + "Settings 2d data dictionary:\n", + "relative_data_folder: SMPS_data\n", + "filename_regex: *.csv\n", + "MIN_SIZE_BYTES: 10\n", + "data_loading_function: general_2d_load\n", + "header_row: 24\n", + "data_checks: {'characters': [250], 'skip_rows': 25, 'skip_end': 0, 'char_counts': {'/': 2, ':': 2}}\n", + "data_sizer_reader: {'Dp_start_keyword': 'Diameter Midpoint (nm)', 'Dp_end_keyword': 'Scan Time (s)', 'convert_scale_from': 'dw/dlogdp'}\n", + "time_column: [1, 2]\n", + "time_format: %m/%d/%Y %H:%M:%S\n", + "delimiter: ,\n", + "time_shift_seconds: 0\n", + "timezone_identifier: UTC\n" + ] + } + ], + "source": [ + "# Settings to load 1d and 2d data from the sizer or any other instrument\n", + "# that has a 1d and 2d data in the same file.\n", + "\n", + "settings_1d, settings_2d = settings_generator.for_general_sizer_1d_2d_load(\n", + " relative_data_folder='SMPS_data',\n", + " filename_regex='*.csv',\n", + " 
file_min_size_bytes=10,\n", + " header_row=24,\n", + " data_checks={\n", + " \"characters\": [250],\n", + " \"skip_rows\": 25,\n", + " \"skip_end\": 0,\n", + " \"char_counts\": {\"/\": 2, \":\": 2}\n", + " },\n", + " data_1d_column=[\n", + " \"Lower Size (nm)\",\n", + " \"Upper Size (nm)\",\n", + " \"Sample Temp (C)\",\n", + " \"Sample Pressure (kPa)\",\n", + " \"Relative Humidity (%)\",\n", + " \"Median (nm)\",\n", + " \"Mean (nm)\",\n", + " \"Geo. Mean (nm)\",\n", + " \"Mode (nm)\",\n", + " \"Geo. Std. Dev.\",\n", + " \"Total Conc. (#/cm³)\"],\n", + " data_1d_header=[\n", + " \"Lower_Size_(nm)\",\n", + " \"Upper_Size_(nm)\",\n", + " \"Sample_Temp_(C)\",\n", + " \"Sample_Pressure_(kPa)\",\n", + " \"Relative_Humidity_(%)\",\n", + " \"Median_(nm)\",\n", + " \"Mean_(nm)\",\n", + " \"Geo_Mean_(nm)\",\n", + " \"Mode_(nm)\",\n", + " \"Geo_Std_Dev.\",\n", + " \"Total_Conc_(#/cc)\"],\n", + " data_2d_dp_start_keyword=\"Diameter Midpoint (nm)\",\n", + " data_2d_dp_end_keyword=\"Scan Time (s)\",\n", + " data_2d_convert_concentration_from=\"dw/dlogdp\",\n", + " time_column=[1, 2],\n", + " time_format=\"%m/%d/%Y %H:%M:%S\",\n", + " delimiter=\",\",\n", + " time_shift_seconds=0,\n", + " timezone_identifier=\"UTC\",\n", + ")\n", + "\n", + "# print and format the settings dictionary\n", + "print('Settings 1d data dictionary:')\n", + "for key, value in settings_1d.items():\n", + " print(f'{key}: {value}')\n", + "\n", + "print('')\n", + "print('Settings 2d data dictionary:')\n", + "for key, value in settings_2d.items():\n", + " print(f'{key}: {value}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Load the data with the interface\n", + "\n", + " Now that we have the settings dictionary, we can use an interface\n", + " that will take the settings and locations and do all those steps from above,\n", + " calling the relevant functions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading data from: 2022-07-07_095151_SMPS.csv\n", + "Loading data from: 2022-07-10_094659_SMPS.csv\n", + "Loading data from: 2022-07-07_095151_SMPS.csv\n", + "Loading data from: 2022-07-10_094659_SMPS.csv\n" + ] + } + ], + "source": [ + "# import the interface\n", + "\n", + "working_path = get_data_folder()\n", + "\n", + "# settings from above\n", + "\n", + "# now call the loader interface\n", + "data_stream_1d = loader_interface.load_files_interface(\n", + " path=working_path,\n", + " settings=settings_1d,\n", + ")\n", + "\n", + "data_stream_2d = loader_interface.load_files_interface(\n", + " path=working_path,\n", + " settings=settings_2d,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Data stream 1d summary:\n", + "Stream(header=['Lower_Size_(nm)', 'Upper_Size_(nm)', 'Sample_Temp_(C)', 'Sample_Pressure_(kPa)', 'Relative_Humidity_(%)', 'Median_(nm)', 'Mean_(nm)', 'Geo_Mean_(nm)', 'Mode_(nm)', 'Geo_Std_Dev.', 'Total_Conc_(#/cc)'], data=array([[2.05000e+01, 2.05000e+01, 2.05000e+01, ..., 2.05000e+01,\n", + " 2.05000e+01, 2.05000e+01],\n", + " [7.91500e+02, 7.91500e+02, 7.91500e+02, ..., 7.91500e+02,\n", + " 7.91500e+02, 7.91500e+02],\n", + " [2.37000e+01, 2.36000e+01, 2.37000e+01, ..., 2.35000e+01,\n", + " 2.33000e+01, 2.35000e+01],\n", + " ...,\n", + " [2.07210e+01, 2.52550e+01, 2.18700e+01, ..., 2.07210e+01,\n", + " 2.10970e+01, 2.07210e+01],\n", + " [2.17900e+00, 2.10100e+00, 2.13600e+00, ..., 2.31800e+00,\n", + " 2.31800e+00, 2.24800e+00],\n", + " [2.16900e+03, 2.39408e+03, 2.27861e+03, ..., 2.08056e+03,\n", + " 2.10616e+03, 2.45781e+03]]), time=array([1.65718376e+09, 1.65718385e+09, 1.65718394e+09, ...,\n", + " 1.65753440e+09, 1.65753450e+09, 1.65753459e+09]), 
files=[['2022-07-07_095151_SMPS.csv', 5620804], ['2022-07-10_094659_SMPS.csv', 2004838]])\n", + "\n", + "Data stream 2d summary:\n", + "Stream(header=['20.72', '21.10', '21.48', '21.87', '22.27', '22.67', '23.08', '23.50', '23.93', '24.36', '24.80', '25.25', '25.71', '26.18', '26.66', '27.14', '27.63', '28.13', '28.64', '29.16', '29.69', '30.23', '30.78', '31.34', '31.91', '32.49', '33.08', '33.68', '34.29', '34.91', '35.55', '36.19', '36.85', '37.52', '38.20', '38.89', '39.60', '40.32', '41.05', '41.79', '42.55', '43.32', '44.11', '44.91', '45.73', '46.56', '47.40', '48.26', '49.14', '50.03', '50.94', '51.86', '52.80', '53.76', '54.74', '55.73', '56.74', '57.77', '58.82', '59.89', '60.98', '62.08', '63.21', '64.36', '65.52', '66.71', '67.93', '69.16', '70.41', '71.69', '72.99', '74.32', '75.67', '77.04', '78.44', '79.86', '81.31', '82.79', '84.29', '85.82', '87.38', '88.96', '90.58', '92.22', '93.90', '95.60', '97.34', '99.10', '100.90', '102.74', '104.60', '106.50', '108.43', '110.40', '112.40', '114.44', '116.52', '118.64', '120.79', '122.98', '125.21', '127.49', '129.80', '132.16', '134.56', '137.00', '139.49', '142.02', '144.60', '147.22', '149.89', '152.61', '155.38', '158.20', '161.08', '164.00', '166.98', '170.01', '173.09', '176.24', '179.43', '182.69', '186.01', '189.38', '192.82', '196.32', '199.89', '203.51', '207.21', '210.97', '214.80', '218.70', '222.67', '226.71', '230.82', '235.01', '239.28', '243.62', '248.05', '252.55', '257.13', '261.80', '266.55', '271.39', '276.32', '281.33', '286.44', '291.64', '296.93', '302.32', '307.81', '313.40', '319.08', '324.88', '330.77', '336.78', '342.89', '349.12', '355.45', '361.90', '368.47', '375.16', '381.97', '388.91', '395.96', '403.15', '410.47', '417.92', '425.51', '433.23', '441.09', '449.10', '457.25', '465.55', '474.00', '482.61', '491.37', '500.29', '509.37', '518.61', '528.03', '537.61', '547.37', '557.31', '567.42', '577.72', '588.21', '598.89', '609.76', '620.82', '632.09', '643.57', '655.25', 
'667.14', '679.25', '691.58', '704.14', '716.92', '729.93', '743.18', '756.67', '770.40', '784.39'], data=array([[ 48.61238578, 44.77267394, 41.14076315, ..., 79.34844804,\n", + " 69.82018511, 114.54210553],\n", + " [ 22.15597542, 45.89533787, 38.87339256, ..., 62.46996336,\n", + " 87.41141628, 90.13918579],\n", + " [ 37.32620324, 49.15319788, 34.00511123, ..., 68.46352554,\n", + " 64.25811167, 107.50020941],\n", + " ...,\n", + " [ 0.73615212, 0. , 0. , ..., 0. ,\n", + " 0. , 0. ],\n", + " [ 0.97000629, 0. , 0.97000629, ..., 0. ,\n", + " 0. , 0. ],\n", + " [ 0. , 0.59493286, 0.9793736 , ..., 0.97991031,\n", + " 2.93952571, 0. ]]), time=array([1.65718376e+09, 1.65718385e+09, 1.65718394e+09, ...,\n", + " 1.65753440e+09, 1.65753450e+09, 1.65753459e+09]), files=[['2022-07-07_095151_SMPS.csv', 5620804], ['2022-07-10_094659_SMPS.csv', 2004838]])\n" + ] + } + ], + "source": [ + "# print data stream summary\n", + "\n", + "print('')\n", + "print('Data stream 1d summary:')\n", + "print(data_stream_1d)\n", + "\n", + "print('')\n", + "print('Data stream 2d summary:')\n", + "print(data_stream_2d)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Help on Stream in module particula.data.stream object:\n", + "\n", + "class Stream(builtins.object)\n", + " | Stream(header: List[str] = , data: numpy.ndarray = , time: numpy.ndarray = , files: List[str] = ) -> None\n", + " | \n", + " | A class for consistent data storage and format.\n", + " | \n", + " | Attributes:\n", + " | ---------\n", + " | header : List[str]\n", + " | A list of strings representing the header of the data stream.\n", + " | data : np.ndarray\n", + " | A numpy array representing the data stream.\n", + " | time : np.ndarray\n", + " | A numpy array representing the time stream.\n", + " | files : List[str]\n", + " | A list of strings representing the files containing the data stream.\n", + " | \n", + " | 
Methods:\n", + " | -------\n", + " | validate_inputs\n", + " | Validates the inputs to the Stream class.\n", + " | datetime64 -> np.ndarray\n", + " | Returns an array of datetime64 objects representing the time stream.\n", + " | Useful for plotting, with matplotlib.dates.\n", + " | return_header_dict -> dict\n", + " | Returns the header as a dictionary with keys as header elements and\n", + " | values as their indices.\n", + " | \n", + " | Methods defined here:\n", + " | \n", + " | __eq__(self, other)\n", + " | \n", + " | __init__(self, header: List[str] = , data: numpy.ndarray = , time: numpy.ndarray = , files: List[str] = ) -> None\n", + " | \n", + " | __post_init__(self)\n", + " | \n", + " | __repr__(self)\n", + " | \n", + " | validate_inputs(self)\n", + " | Validates the inputs for the DataStream object.\n", + " | \n", + " | Raises:\n", + " | TypeError: If header is not a list.\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Readonly properties defined here:\n", + " | \n", + " | datetime64\n", + " | Returns an array of datetime64 objects representing the time stream.\n", + " | Useful for plotting, with matplotlib.dates.\n", + " | \n", + " | return_header_dict\n", + " | Returns the header as a dictionary with index (0, 1) as the keys\n", + " | and the names as values.\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Data descriptors defined here:\n", + " | \n", + " | __dict__\n", + " | dictionary for instance variables (if defined)\n", + " | \n", + " | __weakref__\n", + " | list of weak references to the object (if defined)\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Data and other attributes defined here:\n", + " | \n", + " | __annotations__ = {'data': , 'files': typing.Li...\n", + " | \n", + " | __dataclass_fields__ = {'data': Field(name='data',type=" + ] + }, + "metadata": {}, + "output_type": 
"display_data" + } + ], + "source": [ + "# settin limits helps to see the data better, you can also use np.log10()\n", + "# to plot the concentration in log space\n", + "concentration = data_stream_2d.data\n", + "concentration = np.where(concentration < 1e-5, 1e-5, concentration)\n", + "concentration = np.where(concentration > 10**2.5, 10**2.5, concentration)\n", + "# concentration = np.log10(concentration)\n", + "\n", + "fig, ax = plt.subplots(1, 1)\n", + "plt.contourf(\n", + " data_stream_2d.datetime64,\n", + " np.array(data_stream_2d.header).astype(float),\n", + " concentration,\n", + " cmap=plt.cm.PuBu_r, levels=50)\n", + "plt.yscale('log')\n", + "plt.tick_params(rotation=35) # rotate the x axis labels\n", + "ax.set_xlabel(\"Time (UTC)\")\n", + "ax.set_ylabel('Diameter (nm)')\n", + "plt.colorbar(label='Concentration dN/dlog(Dp) [#/cm3]', ax=ax)\n", + "plt.show()\n", + "fig.tight_layout()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Summary\n", + "\n", + " This example covered loading data from a file that has 2 dimensions, such\n", + " as a size distribution. 
It covered the following:\n", + "\n", + " - Setting the working path\n", + " - Loading the data\n", + " - Formatting the data\n", + " - Plotting the data\n", + " - Generating the settings dictionary\n", + " - Loading the data with the interface\n", + " - Plotting the data stream" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ParticulaDev_py39", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/particula/data/loader.py b/particula/data/loader.py index 4338c2236..bc65f9b2c 100644 --- a/particula/data/loader.py +++ b/particula/data/loader.py @@ -351,6 +351,7 @@ def general_data_formatter( time_column: Union[int, List[int]], time_format: str, delimiter: str = ',', + header_row: int = 0, date_offset: str = None, seconds_shift: int = 0, timezone_identifier: str = 'UTC' @@ -383,6 +384,14 @@ def general_data_formatter( A tuple containing two np.array objects: the first contains the epoch times, and the second contains the data. 
""" + + # find str matching in header row and gets index + if isinstance(data_column[0], str): + data_header = data[header_row].split(delimiter) + # Get data column indices + data_column = [data_header.index(x) + for x in data_column] + # Check the data format data = data_format_checks(data, data_checks) @@ -408,6 +417,7 @@ def sizer_data_formatter( time_column: int, time_format: str, delimiter: str = ',', + header_row: int = 0, date_offset: str = None, seconds_shift: int = 0, timezone_identifier: str = 'UTC' @@ -443,25 +453,33 @@ def sizer_data_formatter( """ # Get Dp range and columns - data_header = data[data_sizer_reader["header_rows"]].split(delimiter) + data_header = data[header_row].split(delimiter) + # check if start and end keywords are in the header + if data_sizer_reader["Dp_start_keyword"] not in data_header: + # rise error with snip of data header + raise ValueError( + f"Cannot find '{data_sizer_reader['Dp_start_keyword']}' in header"\ + + f" {data_header[:20]}..." + ) + if data_sizer_reader["Dp_end_keyword"] not in data_header: + # rise error with snip of data header + raise ValueError( + f"Cannot find '{data_sizer_reader['Dp_end_keyword']}' in header"\ + + f" {data_header[:20]}..." 
+ ) dp_range = [ data_header.index(data_sizer_reader["Dp_start_keyword"]), data_header.index(data_sizer_reader["Dp_end_keyword"]) ] dp_columns = list(range(dp_range[0]+1, dp_range[1])) - dp_header = [data_header[i] for i in dp_columns] + header = [data_header[i] for i in dp_columns] # change from np.array - # Get data columns - data_column = [ - data_header.index(x) for x in data_sizer_reader["list_of_data_headers"] - ] - # Format data data = data_format_checks(data, data_checks) # Get data arrays - epoch_time, data_smps_2d = sample_data( + epoch_time, data_2d = sample_data( data, time_column, time_format, @@ -471,16 +489,6 @@ def sizer_data_formatter( seconds_shift=seconds_shift, timezone_identifier=timezone_identifier ) - epoch_time, data_smps_1d = sample_data( - data, - time_column, - time_format, - data_column, - delimiter, - date_offset, - seconds_shift=seconds_shift, - timezone_identifier=timezone_identifier - ) if "convert_scale_from" in data_sizer_reader: if data_sizer_reader["convert_scale_from"] == "dw": @@ -493,13 +501,13 @@ def sizer_data_formatter( " Either dw/dlogdp or dw must be specified." 
) for i in range(len(epoch_time)): - data_smps_2d[i, :] = convert.convert_sizer_dn( - diameter=np.array(dp_header).astype(float), - dn_dlogdp=data_smps_2d[i, :], + data_2d[i, :] = convert.convert_sizer_dn( + diameter=np.array(header).astype(float), + dn_dlogdp=data_2d[i, :], inverse=inverse ) - return epoch_time, dp_header, data_smps_2d, data_smps_1d + return epoch_time, data_2d, header def non_standard_date_location( diff --git a/particula/data/loader_interface.py b/particula/data/loader_interface.py index 3930cabf3..28ad4ebfd 100644 --- a/particula/data/loader_interface.py +++ b/particula/data/loader_interface.py @@ -141,13 +141,19 @@ def load_files_interface( stream.files.append(file_info[file_i]) # add file info as loaded first_pass = False - # elif (self.settings[key]['data_loading_function'] == - # 'general_2d_sizer_load'): - # self.initialise_2d_datastream(key, path, first_pass) - # first_pass = False + elif settings['data_loading_function'] == 'general_2d_load': + stream = get_2d_stream( + file_path=file_path, + first_pass=first_pass, + settings=settings, + stream=stream + ) + stream.files.append(file_info[file_i]) # add file info as loaded + first_pass = False + # elif (self.settings[key]['data_loading_function'] == # 'netcdf_load'): - # self.initialise_netcdf_datastream(key, path, first_pass) + # self.initialise_netcdf_stream(key, path, first_pass) # first_pass = False else: raise ValueError('Data loading function not recognized', @@ -205,16 +211,17 @@ def get_1d_stream( time=np.array([]), files=[] ) - # Input validation + # Input validation, should it be abstracted? 
def get_2d_stream(
    file_path: str,
    settings: dict,
    first_pass: bool = True,
    stream: Optional[object] = None,
) -> object:
    """
    Load one 2D (sizer-style) data file into a Stream.

    The raw file is read with ``loader.data_raw_loader``, formatted with
    ``loader.sizer_data_formatter`` and shape-checked with
    ``convert.data_shape_check`` before being stored in the stream.

    Parameters:
    ----------
    file_path (str): The path of the file to load data from.
    settings (dict): The loader settings. Must contain the keys
        'data_checks', 'time_column', 'time_format', 'delimiter',
        'time_shift_seconds', 'timezone_identifier', 'data_sizer_reader'
        and 'header_row'. The optional key 'date_location' is passed to
        ``loader.non_standard_date_location`` to obtain a date offset.
    first_pass (bool): If True, the header, data and time of the stream are
        overwritten with the contents of this file. If False, the new data
        is merged into the stream with ``merger.stream_add_data``.
    stream (Stream, optional): The stream to fill. When None, a new empty
        Stream is created.

    Returns:
    ----------
    The stream holding the loaded (or merged) data.

    Raises:
    ----------
    TypeError: If settings is not a dict or first_pass is not a bool.
    KeyError: If any required key is missing from settings.
    FileNotFoundError: If file_path does not point to an existing file.
    """
    if stream is None:
        stream = Stream(
            header=[],
            data=np.array([]),
            time=np.array([]),
            files=[]
        )

    # Input validation
    if not isinstance(settings, dict):
        raise TypeError("The setting parameters must be in a dictionary.")

    required_keys = ['data_checks', 'time_column',
                     'time_format', 'delimiter', 'time_shift_seconds',
                     'timezone_identifier', 'data_sizer_reader',
                     'header_row']
    # scan once and report only the keys that are actually missing
    missing_keys = [key for key in required_keys if key not in settings]
    if missing_keys:
        raise KeyError(
            f"The settings dictionary is missing required keys: "
            f"{missing_keys}")

    if not os.path.isfile(file_path):
        # single-line message: a backslash continuation inside the string
        # would embed the source indentation in the error text
        raise FileNotFoundError(
            f"The file path specified does not exist: {file_path}")

    if not isinstance(first_pass, bool):
        raise TypeError("The first_pass parameter must be a boolean.")

    # TODO: consolidate this loading logic with get_1d_stream
    raw_data = loader.data_raw_loader(file_path=file_path)

    # some instruments store the date outside of the data rows
    if 'date_location' in settings:
        date_offset = loader.non_standard_date_location(
            data=raw_data,
            date_location=settings['date_location']
        )
    else:
        date_offset = None

    epoch_time, data_2d, header = loader.sizer_data_formatter(
        data=raw_data,
        data_checks=settings['data_checks'],
        data_sizer_reader=settings['data_sizer_reader'],
        time_column=settings['time_column'],
        time_format=settings['time_format'],
        delimiter=settings['delimiter'],
        header_row=settings['header_row'],
        date_offset=date_offset,
        seconds_shift=settings['time_shift_seconds'],
        timezone_identifier=settings['timezone_identifier']
    )

    # check data shape against the time and header lengths
    data_2d = convert.data_shape_check(
        time=epoch_time,
        data=data_2d,
        header=header)

    if first_pass:
        stream.header = header
        stream.data = data_2d
        stream.time = epoch_time
    else:
        stream = merger.stream_add_data(
            stream=stream,
            time_new=epoch_time,
            data_new=data_2d,
            header_check=True,
            header_new=header
        )
    return stream
@@ -342,16 +393,16 @@ def get_1d_stream( # file_path=path, # settings=self.settings[key]) -# if first_pass: # create the datastream -# self.datastreams[ +# if first_pass: # create the stream +# self.streams[ # self.settings[key]['data_stream_name'][0] -# ] = DataStream( +# ] = stream( # header_list=header_1d, # average_times=[600], # average_base=self.settings[key]['base_interval_sec'] # ) -# self.datastreams[ +# self.streams[ # self.settings[key]['data_stream_name'][0] # ].add_data( # time_stream=epoch_time, @@ -363,16 +414,16 @@ def get_1d_stream( # file_path=path, # settings=self.settings[key]) -# if first_pass: # create the datastream -# self.datastreams[ +# if first_pass: # create the stream +# self.streams[ # self.settings[key]['data_stream_name'][1] -# ] = DataStream( +# ] = stream( # header_list=header_2d, # average_times=[600], # average_base=self.settings[key]['base_interval_sec'] # ) -# self.datastreams[ +# self.streams[ # self.settings[key]['data_stream_name'][1] # ].add_data( # time_stream=epoch_time, diff --git a/particula/data/settings_generator.py b/particula/data/settings_generator.py index 509bab0e6..543f73505 100644 --- a/particula/data/settings_generator.py +++ b/particula/data/settings_generator.py @@ -9,8 +9,9 @@ def for_general_1d_load( relative_data_folder: str = 'instrument_data', filename_regex: str = '*.csv', file_min_size_bytes: int = 10, + header_row: int = 0, data_checks: Optional[dict] = None, - data_column: List[int] = [3, 5], + data_column: list = [3, 5], data_header: List[str] = ['data 1', 'data 3'], time_column: List[int] = [0, 1], time_format: str = '%Y-%m-%d %H:%M:%S.%f', @@ -31,6 +32,7 @@ def for_general_1d_load( 'filename_regex': filename_regex, 'MIN_SIZE_BYTES': file_min_size_bytes, 'data_loading_function': 'general_1d_load', + 'header_row': header_row, 'data_checks': data_checks, 'data_column': data_column, 'data_header': data_header, @@ -40,3 +42,71 @@ def for_general_1d_load( 'time_shift_seconds': time_shift_seconds, 
def for_general_sizer_1d_2d_load(
    relative_data_folder: str = 'instrument_data',
    filename_regex: str = '*.csv',
    file_min_size_bytes: int = 10,
    header_row: int = 0,
    data_checks: Optional[dict] = None,
    data_1d_column: list = [3, 5],
    data_1d_header: List[str] = ['data 1', 'data 3'],
    data_2d_dp_start_keyword: str = "Date Time",
    data_2d_dp_end_keyword: str = "Total Conc",
    data_2d_convert_concentration_from: str = "dw/dlogdp",  # or dw
    time_column: List[int] = [0, 1],
    time_format: str = '%Y-%m-%d %H:%M:%S.%f',
    delimiter: str = ',',
    time_shift_seconds: int = 0,
    timezone_identifier: str = 'UTC',
) -> tuple:
    """Generate settings for the 1d general file loader and the
    2d general sizer file loader.

    Both settings dictionaries share the file discovery, header row, data
    checks, time and delimiter options. The 1d settings add the data
    columns and their headers; the 2d settings add a 'data_sizer_reader'
    dictionary built from the two header keywords and the concentration
    scale.

    Parameters
    ----------
    relative_data_folder : folder stored under 'relative_data_folder'.
    filename_regex : pattern stored under 'filename_regex'.
    file_min_size_bytes : value stored under 'MIN_SIZE_BYTES'.
    header_row : value stored under 'header_row'.
    data_checks : data check settings; a default dictionary is used
        when None.
    data_1d_column : columns stored under 'data_column' (1d settings).
    data_1d_header : names stored under 'data_header' (1d settings).
    data_2d_dp_start_keyword : stored as 'Dp_start_keyword' (2d settings).
    data_2d_dp_end_keyword : stored as 'Dp_end_keyword' (2d settings).
    data_2d_convert_concentration_from : stored as 'convert_scale_from'
        (2d settings), "dw/dlogdp" or "dw".
    time_column, time_format, delimiter, time_shift_seconds,
    timezone_identifier : stored under keys of the same name.

    Returns
    -------
    Tuple[dict, dict]
        The settings for the 1d loader and the 2d loader.
    """
    if data_checks is None:
        # NOTE: no trailing comma after the closing brace; a trailing comma
        # would turn this dictionary into a one-element tuple.
        data_checks = {
            "characters": [10, 100],
            "char_counts": {",": 4, ":": 0},
            "skip_rows": 0,
            "skip_end": 0,
        }

    # list arguments are copied so that editing the returned settings never
    # mutates the shared default arguments (or the caller's lists)
    settings_1d = {
        'relative_data_folder': relative_data_folder,
        'filename_regex': filename_regex,
        'MIN_SIZE_BYTES': file_min_size_bytes,
        'data_loading_function': 'general_1d_load',
        'header_row': header_row,
        'data_checks': data_checks,
        'data_column': list(data_1d_column),
        'data_header': list(data_1d_header),
        'time_column': list(time_column),
        'time_format': time_format,
        'delimiter': delimiter,
        'time_shift_seconds': time_shift_seconds,
        'timezone_identifier': timezone_identifier,
    }
    settings_2d = {
        'relative_data_folder': relative_data_folder,
        'filename_regex': filename_regex,
        'MIN_SIZE_BYTES': file_min_size_bytes,
        'data_loading_function': 'general_2d_load',
        'header_row': header_row,
        'data_checks': data_checks,
        'data_sizer_reader': {
            "Dp_start_keyword": data_2d_dp_start_keyword,
            "Dp_end_keyword": data_2d_dp_end_keyword,
            "convert_scale_from": data_2d_convert_concentration_from,
        },
        'time_column': list(time_column),
        'time_format': time_format,
        'delimiter': delimiter,
        'time_shift_seconds': time_shift_seconds,
        'timezone_identifier': timezone_identifier,
    }
    return settings_1d, settings_2d