From 79d7838b0ac354bc219b3de9488ec9ce3a3c94ab Mon Sep 17 00:00:00 2001 From: Heiko Strathmann Date: Tue, 9 Jul 2019 10:42:15 +0100 Subject: [PATCH] Systematic framework tests for all machines --- src/shogun/machine/BaggingMachine.cpp | 2 +- src/shogun/machine/Machine.cpp | 4 +- .../evaluation/CrossValidation_unittest.cc | 206 ---------- tests/unit/machine/all_machines_unittest.cc | 384 ++++++++++++++++++ tests/unit/utils/SGObjectIterator.h | 3 +- 5 files changed, 389 insertions(+), 210 deletions(-) delete mode 100644 tests/unit/evaluation/CrossValidation_unittest.cc create mode 100644 tests/unit/machine/all_machines_unittest.cc diff --git a/src/shogun/machine/BaggingMachine.cpp b/src/shogun/machine/BaggingMachine.cpp index f7cc98bd854..1f6a733cb4a 100644 --- a/src/shogun/machine/BaggingMachine.cpp +++ b/src/shogun/machine/BaggingMachine.cpp @@ -309,7 +309,7 @@ void CBaggingMachine::init() m_features = NULL; m_combination_rule = NULL; m_labels = NULL; - m_num_bags = 0; + m_num_bags = 100; m_bag_size = 0; m_all_oob_idx = SGVector(); m_oob_indices = NULL; diff --git a/src/shogun/machine/Machine.cpp b/src/shogun/machine/Machine.cpp index a40228ae399..f0159c58235 100644 --- a/src/shogun/machine/Machine.cpp +++ b/src/shogun/machine/Machine.cpp @@ -37,7 +37,7 @@ bool CMachine::train(CFeatures* data) if (train_require_labels()) { if (m_labels == NULL) - SG_ERROR("%s@%p: No labels given", get_name(), this) + SG_ERROR("No labels provided.\n", get_name()) m_labels->ensure_valid(get_name()); } @@ -47,7 +47,7 @@ bool CMachine::train(CFeatures* data) if (support_feature_dispatching()) { - REQUIRE(data != NULL, "Features not provided!"); + REQUIRE(data != NULL, "No features provided.\n"); REQUIRE( data->get_num_vectors() == m_labels->get_num_labels(), "Number of training vectors (%d) does not match number of " diff --git a/tests/unit/evaluation/CrossValidation_unittest.cc b/tests/unit/evaluation/CrossValidation_unittest.cc deleted file mode 100644 index f1df0dfd44d..00000000000 --- a/tests/unit/evaluation/CrossValidation_unittest.cc +++ /dev/null @@ -1,206 +0,0 @@ -/* - * This software is distributed under BSD 3-clause license (see LICENSE file). - * - * Authors: Heiko Strathmann - */ - -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - - - -using namespace shogun; - - -#include - -using namespace shogun; -using namespace std; - -template -class CrossValidationTests : public ::testing::Test -{ -protected: - void SetUp() - { - } - - void init() - { - machine = new T(); - SG_REF(machine) - - this->generate_data(this->machine->get_machine_problem_type()); - - if (auto* casted = dynamic_cast(machine)) - casted->set_kernel(new CGaussianKernel()); - if (auto* casted = dynamic_cast(machine)) - casted->set_distance(new CEuclideanDistance()); - - auto ss = new CCrossValidationSplitting(labels, 5); - CEvaluation* ec = nullptr; - switch (machine->get_machine_problem_type()) - { - case PT_BINARY: - ec = new CAccuracyMeasure(); - break; - case PT_MULTICLASS: - ec = new CMulticlassAccuracy(); - break; - case PT_REGRESSION: - ec = new CMeanSquaredError(); - break; - default: - SG_SNOTIMPLEMENTED - break; - } - cv = new CCrossValidation(machine, features, labels, ss, ec); - cv->set_num_runs(3); - SG_REF(cv); - } - - void clean() - { - SG_UNREF(features) - SG_UNREF(labels) - SG_UNREF(machine) - SG_UNREF(cv); - } - - void TearDown() - { - } - - auto test_single_thread() - { - init(); - this->cv->put("seed", 1); - get_global_parallel()->set_num_threads(1); - auto result = cv->evaluate()->get("mean"); - clean(); - return result; - } - - auto test_multi_thread() - { - init(); - this->cv->put("seed", 1); - get_global_parallel()->set_num_threads(4); - auto result = cv->evaluate()->get("mean"); - clean(); - return result; - } - - void generate_data(EProblemType pt) - { - auto N = 50; - auto D = 5; - - std::mt19937_64 prng(57); - NormalDistribution randn; - UniformRealDistribution rand(0,1); - UniformIntDistribution randi(0,2); - - SGMatrix X(D,N); - for (auto i : range(D*N)) - X.matrix[i] = randn(prng); - features = new CDenseFeatures(X); - - SGVector y_reg(N); - SGVector y_binary(N); - SGVector y_mc(N); - - for (auto i : range(N)) - { - auto redux = linalg::mean(X.get_column(i)); - - y_reg[i] = redux + std::sin(redux) + 1; - y_mc[i] = redux<0 ? 0 : 1; - y_binary[i] = y_mc[i] * 2 -1; - - // noise - y_reg[i] += randn(prng)*0.1; - if (rand(prng)>0.1) - { - y_binary[i] *= (-1); - y_mc[i] = (int32_t(y_mc[i]) + randi(prng)) % 3; - } - } - - switch (pt) - { - case PT_BINARY: - case PT_CLASS: - { - labels = new CBinaryLabels(y_binary); - break; - } - - case PT_MULTICLASS: - { - labels = new CMulticlassLabels(y_mc); - break; - } - - case PT_REGRESSION: - labels = new CRegressionLabels(y_reg); - break; - - default: - SG_SERROR("Unsupported problem type: %d\n", pt); - FAIL(); - } - - SG_REF(features) - SG_REF(labels) - } - - CFeatures* features; - CLabels* labels; - CCrossValidation* cv; - T* machine; -}; - -typedef ::testing::Types -MachineTypes; - -TYPED_TEST_CASE(CrossValidationTests, MachineTypes); - -TYPED_TEST(CrossValidationTests, execute_single_thread) -{ - this->test_single_thread(); -} - -TYPED_TEST(CrossValidationTests, execute_multi_thread) -{ - this->test_multi_thread(); -} - -TYPED_TEST(CrossValidationTests, single_multi_same_result) -{ - auto single = this->test_single_thread(); - auto multi = this->test_multi_thread(); - - EXPECT_NEAR(single, multi, 1e-7); -} diff --git a/tests/unit/machine/all_machines_unittest.cc b/tests/unit/machine/all_machines_unittest.cc new file mode 100644 index 00000000000..0d3d009fae9 --- /dev/null +++ b/tests/unit/machine/all_machines_unittest.cc @@ -0,0 +1,384 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Heiko Strathmann + */ + +#include + +#include "utils/SGObjectIterator.h" +#include "utils/Utils.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + + +using namespace shogun; +using namespace std; + +void init_machine(CMachine* machine) +{ + if (auto* casted = dynamic_cast(machine)) + casted->set_kernel(new CGaussianKernel()); + if (auto* casted = dynamic_cast(machine)) + casted->set_distance(new CEuclideanDistance()); + + if (machine->has("max_iterations")) + machine->put("max_iterations", 50); +} + +CCrossValidation* generate_cv(CMachine* machine, const std::pair*, CDenseLabels*>& data) +{ + auto ss = new CCrossValidationSplitting(data.second, 5); + CEvaluation* ec = nullptr; + switch (machine->get_machine_problem_type()) + { + case PT_BINARY: + ec = new CAccuracyMeasure(); + break; + case PT_MULTICLASS: + ec = new CMulticlassAccuracy(); + break; + case PT_REGRESSION: + ec = new CMeanSquaredError(); + break; + default: + SG_SNOTIMPLEMENTED + break; + } + + auto cv = new CCrossValidation(machine, data.first, data.second, ss, ec); + cv->set_num_runs(3); + SG_REF(cv); + + return cv; +} + +std::pair*, CDenseLabels*> generate_data(const CMachine* machine) +{ + auto N = 50; + auto D = 3; + + std::mt19937_64 prng(57); + NormalDistribution randn; + UniformRealDistribution rand(0,1); + UniformIntDistribution randi(0,2); + + SGMatrix X(D,N); + for (auto i : range(D*N)) + X.matrix[i] = randn(prng); + auto features = new CDenseFeatures(X); + + SGVector y_reg(N); + SGVector y_binary(N); + SGVector y_mc(N); + + for (auto i : range(N)) + { + auto redux = linalg::mean(X.get_column(i)); + + y_reg[i] = redux + std::sin(redux) + 1; + y_mc[i] = redux<0 ? 0 : 1; + y_binary[i] = y_mc[i] * 2 -1; + + // noise + y_reg[i] += randn(prng)*0.1; + if (rand(prng)>0.1) + { + y_binary[i] *= (-1); + y_mc[i] = (int32_t(y_mc[i]) + randi(prng)) % 3; + } + } + + CDenseLabels* labels; + auto pt = machine->get_machine_problem_type(); + switch (pt) + { + case PT_BINARY: + case PT_CLASS: + { + labels = new CBinaryLabels(y_binary); + break; + } + + case PT_MULTICLASS: + { + labels = new CMulticlassLabels(y_mc); + break; + } + + case PT_REGRESSION: + labels = new CRegressionLabels(y_reg); + break; + + default: + SG_SERROR("Unsupported problem type: %d\n", pt); + } + + SG_REF(features) + SG_REF(labels) + return make_pair(features, labels); +} + +void serialize_machine(CMachine* machine, std::string& fname) +{ + auto fs = io::FileSystemRegistry::instance(); + std::string class_name = machine->get_name(); + fname = "shogun-unittest-AllMachines-trained_model_seiralization_consistency-" + class_name + + ".XXXXXX"; + generate_temp_filename(const_cast(fname.c_str())); + + SG_REF(machine); + EXPECT_FALSE(fs->file_exists(fname)); + std::unique_ptr file; + EXPECT_FALSE(fs->new_writable_file(fname, &file)); + auto fos = some(file.get()); + auto serializer = some(); + serializer->attach(fos); + serializer->write(wrap(machine)); +} + +CMachine* deserialize_machine(std::string fname) +{ + auto fs = io::FileSystemRegistry::instance(); + std::unique_ptr raf; + EXPECT_FALSE(fs->new_random_access_file(fname, &raf)); + auto fis = some(raf.get()); + auto deserializer = some(); + deserializer->attach(fis); + auto deser_obj = deserializer->read_object(); + bool delete_success = !fs->delete_file(fname); + EXPECT_TRUE(delete_success); + + return dynamic_cast(deser_obj.get()); +} + +// TODO, generate this automatically, like in trained_model_serialization +std::set all_machines = {"LibSVM", "Perceptron", "LibLinear", + "MulticlassLibLinear", "LinearRidgeRegression", "KNN", + "KernelRidgeRegression", "LibLinearRegression", "RandomForest"}; + +TEST(AllMachines, train_uninitialized) +{ + std::set ignores = {}; + for (auto obj : sg_object_iterator(all_machines).ignore(ignores)) + { + auto machine = obj->as(); + SCOPED_TRACE(machine->get_name()); + + EXPECT_THROW(machine->train(), ShogunException); + } +} + +TEST(AllMachines, train_execute) +{ + std::set ignores = {}; + for (auto obj : sg_object_iterator(all_machines).ignore(ignores)) + { + auto machine = obj->as(); + SCOPED_TRACE(machine->get_name()); + + init_machine(machine); + auto data = generate_data(machine); + machine->set_labels(data.second); + machine->train(data.first); + } +} + +TEST(AllMachines, train_thread_consistency) +{ + std::set ignores = { + "RandomForest" // segfault + }; + for (auto obj : sg_object_iterator(all_machines).ignore(ignores)) + { + auto machine = obj->as(); + SCOPED_TRACE(machine->get_name()); + + init_machine(machine); + auto machine2 = make_clone(machine); + + auto data = generate_data(machine); + + get_global_parallel()->set_num_threads(1); + machine->set_labels(data.second); + if (machine->has("seed")) + machine->put("seed", 1); + machine->train(data.first); + auto result_single = machine->apply(data.first); + + init_machine(machine); + get_global_parallel()->set_num_threads(4); + machine2->set_labels(data.second); + if (machine2->has("seed")) + machine2->put("seed", 1); + machine2->train(data.first); + auto result_multi = machine2->apply(data.first); + + EXPECT_TRUE(result_single->equals(result_multi)); + } +} + + +TEST(AllMachines, view_subsampling_consistency) +{ + std::set ignores = { + "RandomForest" // segfault + }; + for (auto obj : sg_object_iterator(all_machines).ignore(ignores)) + { + auto machine = obj->as(); + SCOPED_TRACE(machine->get_name()); + + init_machine(machine); + auto data = generate_data(machine); + + auto X = data.first->get_feature_matrix(); + auto y = data.second->get_labels(); + + SGVector subset = {1,3,4,6}; + + auto features_subset = view(data.first, subset); + auto labels_subset = view(data.second, subset); + + SGMatrix X_subsampled(X.num_rows, subset.size()); + SGVector y_subsampled(subset.size()); + + for (auto i : range(subset.size())) + { + memcpy(X_subsampled.get_column_vector(i), X.get_column_vector(subset[i]), X.num_rows * sizeof(decltype(X(0,0)))); + y_subsampled[i] = y[subset[i]]; + } + + auto features_subsampled = new CDenseFeatures(X_subsampled); + auto labels_subsampled = make_clone(data.second); + labels_subsampled->set_labels(y_subsampled); + + auto machine_subset = make_clone(machine); + auto machine_subsampled = make_clone(machine); + if (machine->has("seed")) + { + machine_subset->put("seed", 1); + machine_subsampled->put("seed", 1); + } + + machine_subset->set_labels(labels_subset); + machine_subset->train(features_subset); + machine_subsampled->set_labels(labels_subsampled); + machine_subsampled->train(features_subsampled); + + auto result_subset = machine_subset->apply(data.first); + auto result_subsampled = machine_subsampled->apply(data.first); + + EXPECT_TRUE(result_subset->equals(result_subsampled)); + } +} + +TEST(AllMachines, cv_thread_consistency) +{ + std::set ignores = { + "RandomForest" // segfault + }; + for (auto obj : sg_object_iterator(all_machines).ignore(ignores)) + { + auto machine = obj->as(); + SCOPED_TRACE(machine->get_name()); + + init_machine(machine); + auto data = generate_data(machine); + auto cv = generate_cv(machine, data); + auto cv2 = make_clone(cv); + + get_global_parallel()->set_num_threads(1); + cv->put("seed", 1); + auto result_single = cv->evaluate(); + + get_global_parallel()->set_num_threads(4); + cv2->put("seed", 1); + auto result_multi = cv2->evaluate(); + + EXPECT_TRUE(result_single->equals(result_multi)); + } +} + +TEST(AllMachines, train_apply_no_side_effects) +{ + std::set ignores = { + "RandomForest" // segfault + }; + for (auto obj : sg_object_iterator(all_machines).ignore(ignores)) + { + auto machine = obj->as(); + SCOPED_TRACE(machine->get_name()); + + init_machine(machine); + auto data = generate_data(machine); + + auto features_before = data.first->clone(); + auto labels_before = data.second->clone(); + + machine->set_labels(data.second); + machine->train(data.first); + machine->apply(data.first); + + auto features_after = data.first->clone(); + auto labels_after = data.second->clone(); + + EXPECT_TRUE(features_before->equals(features_after)); + EXPECT_TRUE(labels_before->equals(labels_after)); + } +} + +TEST(AllMachines, trained_model_serialization_consistency) +{ + std::set ignores = { + }; + for (auto obj : sg_object_iterator(all_machines).ignore(ignores)) + { + auto machine = obj->as(); + SCOPED_TRACE(machine->get_name()); + + init_machine(machine); + auto data = generate_data(machine); + + machine->set_labels(data.second); + machine->train(data.first); + + auto predictions = machine->apply(data.first); + + std::string filename; + serialize_machine(machine, filename); + auto deserialized_machine = deserialize_machine(filename); + + auto deserialized_predictions = deserialized_machine->apply(data.first); + + EXPECT_TRUE(predictions->equals(deserialized_predictions)); + } +} diff --git a/tests/unit/utils/SGObjectIterator.h b/tests/unit/utils/SGObjectIterator.h index 9e7d0aa3ca9..f14d9e8d5c2 100644 --- a/tests/unit/utils/SGObjectIterator.h +++ b/tests/unit/utils/SGObjectIterator.h @@ -5,8 +5,9 @@ #include #include #include +#include -namespace +namespace shogun { // to have a type for non-template SGObject classes struct untemplated_sgobject