Skip to content

Commit

Permalink
Added unittest for missing values flag
Browse files Browse the repository at this point in the history
  • Loading branch information
holukas committed Aug 28, 2024
1 parent 377cedf commit 437dbee
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 44 deletions.
2 changes: 1 addition & 1 deletion diive/pkgs/qaqc/flags.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def __init__(self, series: Series, idstr: str = None, verbose: bool = False):
super().__init__(series=series, flagid=self.flagid, idstr=idstr)
self.verbose = verbose

def calc(self):
def calc(self, repeat=False):
self._overall_flag, n_iterations = self.repeat(self.run_flagtests, repeat=False)
# if self.showplot:
# self.defaultplot(n_iterations=1)
Expand Down
77 changes: 37 additions & 40 deletions notebooks/OutlierDetection/MissingValues.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -65,24 +65,21 @@
"metadata": {
"tags": [],
"ExecuteTime": {
"end_time": "2024-08-28T09:07:58.635169Z",
"start_time": "2024-08-28T09:07:56.890714Z"
"end_time": "2024-08-28T10:56:09.883777Z",
"start_time": "2024-08-28T10:56:09.872776Z"
}
},
"source": [
"import importlib.metadata\n",
"import warnings\n",
"from datetime import datetime\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import diive.configs.exampledata as ed\n",
"from diive.core.plotting.timeseries import TimeSeries\n",
"from diive.pkgs.qaqc.flags import MissingValues\n",
"\n",
"\n",
"warnings.filterwarnings('ignore')\n",
"version_diive = importlib.metadata.version(\"diive\")\n",
"print(f\"diive version: v{version_diive}\")"
Expand All @@ -96,7 +93,7 @@
]
}
],
"execution_count": 1
"execution_count": 11
},
{
"cell_type": "markdown",
Expand All @@ -117,8 +114,8 @@
"metadata": {
"tags": [],
"ExecuteTime": {
"end_time": "2024-08-28T09:07:58.650676Z",
"start_time": "2024-08-28T09:07:58.646169Z"
"end_time": "2024-08-28T10:56:10.008522Z",
"start_time": "2024-08-28T10:56:09.996018Z"
}
},
"source": "help(MissingValues)",
Expand Down Expand Up @@ -146,7 +143,7 @@
]
}
],
"execution_count": 2
"execution_count": 12
},
{
"cell_type": "markdown",
Expand Down Expand Up @@ -178,8 +175,8 @@
},
"tags": [],
"ExecuteTime": {
"end_time": "2024-08-28T09:07:58.873229Z",
"start_time": "2024-08-28T09:07:58.763049Z"
"end_time": "2024-08-28T10:56:10.150031Z",
"start_time": "2024-08-28T10:56:10.040523Z"
}
},
"source": [
Expand All @@ -194,7 +191,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded .parquet file L:\\Sync\\luhk_work\\20 - CODING\\21 - DIIVE\\diive\\diive\\configs\\exampledata\\exampledata_PARQUET_CH-DAV_FP2022.5_2013-2022_ID20230206154316_30MIN.parquet (0.041 seconds). Detected time resolution of <30 * Minutes> / 30min \n"
"Loaded .parquet file L:\\Sync\\luhk_work\\20 - CODING\\21 - DIIVE\\diive\\diive\\configs\\exampledata\\exampledata_PARQUET_CH-DAV_FP2022.5_2013-2022_ID20230206154316_30MIN.parquet (0.028 seconds). Detected time resolution of <30 * Minutes> / 30min \n"
]
},
{
Expand All @@ -215,18 +212,18 @@
"Freq: 30min, Name: Tair_f, Length: 1488, dtype: float64"
]
},
"execution_count": 3,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 3
"execution_count": 13
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-28T09:07:59.255528Z",
"start_time": "2024-08-28T09:07:58.968945Z"
"end_time": "2024-08-28T10:56:10.482095Z",
"start_time": "2024-08-28T10:56:10.198036Z"
}
},
"cell_type": "code",
Expand All @@ -243,7 +240,7 @@
"output_type": "display_data"
}
],
"execution_count": 4
"execution_count": 14
},
{
"cell_type": "markdown",
Expand All @@ -254,8 +251,8 @@
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-28T09:07:59.557653Z",
"start_time": "2024-08-28T09:07:59.287556Z"
"end_time": "2024-08-28T10:56:10.783128Z",
"start_time": "2024-08-28T10:56:10.515112Z"
}
},
"source": [
Expand All @@ -275,7 +272,7 @@
"output_type": "display_data"
}
],
"execution_count": 5
"execution_count": 15
},
{
"cell_type": "markdown",
Expand All @@ -298,8 +295,8 @@
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-28T09:07:59.573653Z",
"start_time": "2024-08-28T09:07:59.563653Z"
"end_time": "2024-08-28T10:56:10.814637Z",
"start_time": "2024-08-28T10:56:10.796130Z"
}
},
"source": [
Expand All @@ -315,14 +312,14 @@
]
}
],
"execution_count": 6
"execution_count": 16
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-28T09:07:59.666774Z",
"start_time": "2024-08-28T09:07:59.653773Z"
"end_time": "2024-08-28T10:56:10.845638Z",
"start_time": "2024-08-28T10:56:10.832639Z"
}
},
"source": [
Expand All @@ -348,22 +345,22 @@
"Freq: 30min, Name: FLAG_Tair_f_MISSING_TEST, Length: 1488, dtype: float64"
]
},
"execution_count": 7,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 7
"execution_count": 17
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-28T09:07:59.728771Z",
"start_time": "2024-08-28T09:07:59.714771Z"
"end_time": "2024-08-28T10:56:10.908149Z",
"start_time": "2024-08-28T10:56:10.893640Z"
}
},
"cell_type": "code",
"source": "print(f\"Number of missing values: {int(flag.loc[flag==2].count())}\")",
"source": "print(f\"Number of missing values: {int(flag.loc[flag == 2].count())}\")",
"outputs": [
{
"name": "stdout",
Expand All @@ -373,14 +370,14 @@
]
}
],
"execution_count": 8
"execution_count": 18
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-28T09:08:00.029407Z",
"start_time": "2024-08-28T09:07:59.852898Z"
"end_time": "2024-08-28T10:56:11.081663Z",
"start_time": "2024-08-28T10:56:10.962149Z"
}
},
"source": "flag.plot(x_compat=True, title=\"Missing values (flag=2)\");",
Expand All @@ -396,7 +393,7 @@
"output_type": "display_data"
}
],
"execution_count": 9
"execution_count": 19
},
{
"cell_type": "markdown",
Expand All @@ -419,8 +416,8 @@
"metadata": {
"tags": [],
"ExecuteTime": {
"end_time": "2024-08-28T09:08:00.060921Z",
"start_time": "2024-08-28T09:08:00.044411Z"
"end_time": "2024-08-28T10:56:11.145175Z",
"start_time": "2024-08-28T10:56:11.133175Z"
}
},
"source": [
Expand All @@ -432,18 +429,18 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Finished 2024-08-28 11:08:00\n"
"Finished 2024-08-28 12:56:11\n"
]
}
],
"execution_count": 10
"execution_count": 20
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-28T09:08:00.092977Z",
"start_time": "2024-08-28T09:08:00.080979Z"
"end_time": "2024-08-28T10:56:11.175175Z",
"start_time": "2024-08-28T10:56:11.165176Z"
}
},
"source": [],
Expand Down
25 changes: 22 additions & 3 deletions tests/test_outlierdetection.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,42 @@
import unittest

import numpy as np
import pandas as pd

import diive.configs.exampledata as ed
from diive.pkgs.createvar.noise import add_impulse_noise
from diive.pkgs.outlierdetection.absolutelimits import AbsoluteLimits, AbsoluteLimitsDaytimeNighttime
from diive.pkgs.outlierdetection.hampel import Hampel, HampelDaytimeNighttime
from diive.pkgs.outlierdetection.incremental import zScoreIncrements
from diive.pkgs.outlierdetection.localsd import LocalSD
from diive.pkgs.outlierdetection.lof import LocalOutlierFactorAllData
from diive.pkgs.outlierdetection.zscore import zScore, zScoreDaytimeNighttime
from diive.pkgs.outlierdetection.hampel import Hampel, HampelDaytimeNighttime
from diive.pkgs.outlierdetection.trim import TrimLow
from diive.pkgs.outlierdetection.zscore import zScore, zScoreDaytimeNighttime
from diive.pkgs.qaqc.flags import MissingValues


# kudos https://medium.com/@ms_somanna/guide-to-adding-noise-to-your-data-using-python-and-numpy-c8be815df524

class TestOutlierDetection(unittest.TestCase):

def test_missing_values(self):
    """Verify that the MissingValues flag mirrors the gaps in a series.

    Takes the July 2018 records of the example ``Tair_f`` series, blanks
    out two ranges of records and compares the flag counts with counts
    derived from the series itself. This test treats flag value 2 as
    "missing" and flag value 0 as "available".
    """
    frame = ed.load_exampledata_parquet()
    tair = frame['Tair_f'].copy()

    # Keep July 2018 only
    in_july_2018 = (tair.index.year == 2018) & (tair.index.month == 7)
    tair = tair.loc[in_july_2018].copy()

    # Delete some data points to create two artificial gaps
    for start, stop in ((500, 600), (721, 791)):
        tair.iloc[start:stop] = np.nan

    detector = MissingValues(series=tair)
    detector.calc()
    flag = detector.get_flag()

    # Count flagged records per category
    n_missing_vals = int((flag == 2).sum())
    n_available_vals = int((flag == 0).sum())
    n_total_vals = n_available_vals + n_missing_vals

    # Flag counts must agree with what the series itself reports, and
    # the two categories together must account for every record
    self.assertEqual(n_missing_vals, int(tair.isnull().sum()))
    self.assertEqual(n_available_vals, int(tair.count()))
    self.assertEqual(n_total_vals, len(tair))

def test_trim_low_nt(self):
df = ed.load_exampledata_parquet()
s = df['Tair_f'].copy()
Expand Down Expand Up @@ -322,7 +342,6 @@ def test_localsd(self):
self.assertEqual(gooddata_stats.loc['max']['flag'], 0)
self.assertEqual(gooddata_stats.loc['count']['s_noise'], 1444)


def test_zscore_increments(self):
df = ed.load_exampledata_parquet()
s = df['Tair_f'].copy()
Expand Down

0 comments on commit 437dbee

Please sign in to comment.