-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Lib/model accessor #1
base: main
Are you sure you want to change the base?
Conversation
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Seems great! Some minor changes but nothing really important.
Could you also update the Makefile by adding your package please?
|
||
|
||
```python | ||
from dku_model_accessor import get_model_handler, ModelAccessor |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
from dku_model_accessor import get_model_handler, ModelAccessor | |
import dataiku | |
from dku_model_accessor import get_model_handler, ModelAccessor |
```python | ||
from dku_model_accessor import get_model_handler, ModelAccessor | ||
|
||
model_id = 'XQyU0TO0' |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
model_id = 'XQyU0TO0' | |
model_id = 'YOUR_MODEL_ID' |
Maybe use a more explicit ID
class DkuModelAccessorConstants(object): | ||
MODEL_ID = 'model_id' | ||
VERSION_ID = 'version_id' | ||
REGRRSSION_TYPE = 'REGRESSION' |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
REGRRSSION_TYPE = 'REGRESSION' | |
REGRESSION_TYPE = 'REGRESSION' |
typo
def get_original_test_df(self, limit=DkuModelAccessorConstants.MAX_NUM_ROW): | ||
try: | ||
full_test_df = self.model_handler.get_test_df()[0] | ||
test_df = full_test_df[:limit] | ||
logger.info('Loading {}/{} rows of the original test set'.format(len(test_df), len(full_test_df))) | ||
return test_df | ||
except Exception as e: | ||
logger.warning('Can not retrieve original test set: {}. The plugin will take the whole original dataset.'.format(e)) | ||
full_test_df = self.model_handler.get_full_df()[0] | ||
test_df = full_test_df[:limit] | ||
logger.info('Loading {}/{} rows of the whole original test set'.format(len(test_df), len(full_test_df))) | ||
return test_df |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
def get_original_test_df(self, limit=DkuModelAccessorConstants.MAX_NUM_ROW): | |
try: | |
full_test_df = self.model_handler.get_test_df()[0] | |
test_df = full_test_df[:limit] | |
logger.info('Loading {}/{} rows of the original test set'.format(len(test_df), len(full_test_df))) | |
return test_df | |
except Exception as e: | |
logger.warning('Can not retrieve original test set: {}. The plugin will take the whole original dataset.'.format(e)) | |
full_test_df = self.model_handler.get_full_df()[0] | |
test_df = full_test_df[:limit] | |
logger.info('Loading {}/{} rows of the whole original test set'.format(len(test_df), len(full_test_df))) | |
return test_df | |
def get_original_test_df(self, limit=DkuModelAccessorConstants.MAX_NUM_ROW): | |
try: | |
full_test_df = self.model_handler.get_test_df()[0] | |
except Exception as e: | |
logger.warning('Can not retrieve original test set: {}. The plugin will take the whole original dataset.'.format(e)) | |
full_test_df = self.model_handler.get_full_df()[0] | |
test_df = full_test_df[:limit] | |
logger.info('Loading {}/{} rows of the original test set'.format(len(test_df), len(full_test_df))) | |
return test_df |
Code repeated, I would only try...catch on what can really raise an exception.
logger.info('Fitting surrogate model ...') | ||
surrogate_model = SurrogateModel(self.get_prediction_type()) | ||
original_test_df = self.get_original_test_df() | ||
predictions_on_original_test_df = self.get_predictor().predict(original_test_df) | ||
surrogate_df = original_test_df[self.get_selected_features()] | ||
surrogate_df[DkuModelAccessorConstants.SURROGATE_TARGET] = predictions_on_original_test_df['prediction'] | ||
surrogate_model.fit(surrogate_df, DkuModelAccessorConstants.SURROGATE_TARGET) | ||
feature_names = surrogate_model.get_features() | ||
feature_importances = surrogate_model.clf.feature_importances_ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe it would be better to wrap into a new method fit_surrogate_model()
def get_selected_features(self): | ||
""" | ||
Return only features used in the model | ||
""" | ||
selected_features = [] | ||
for feat, feat_info in self.get_per_feature().items(): | ||
if feat_info.get('role') == 'INPUT': | ||
selected_features.append(feat) | ||
return selected_features | ||
|
||
def get_selected_and_rejected_features(self): | ||
""" | ||
Return all features in the input dataset except the target | ||
""" | ||
selected_features = [] | ||
for feat, feat_info in self.get_per_feature().items(): | ||
if feat_info.get('role') in ['INPUT', 'REJECT']: | ||
selected_features.append(feat) | ||
return selected_features |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
def get_selected_features(self): | |
""" | |
Return only features used in the model | |
""" | |
selected_features = [] | |
for feat, feat_info in self.get_per_feature().items(): | |
if feat_info.get('role') == 'INPUT': | |
selected_features.append(feat) | |
return selected_features | |
def get_selected_and_rejected_features(self): | |
""" | |
Return all features in the input dataset except the target | |
""" | |
selected_features = [] | |
for feat, feat_info in self.get_per_feature().items(): | |
if feat_info.get('role') in ['INPUT', 'REJECT']: | |
selected_features.append(feat) | |
return selected_features | |
def get_features_by_status(self, status): | |
return [feat for feat, feat_info in self.get_per_feature().items() if feat_info.get('role') in status] | |
def get_selected_features(self): | |
""" | |
Return only features used in the model | |
""" | |
return self.get_features_by_status(['INPUT']) | |
def get_selected_and_rejected_features(self): | |
""" | |
Return all features in the input dataset except the target | |
""" | |
return self.get_features_by_status(['INPUT', 'REJECT']) |
DRY
for algorithm in ALGORITHMS_WITH_VARIABLE_IMPORTANCE: | ||
if isinstance(algo, algorithm): | ||
return True | ||
elif predictor.params.modeling_params.get('algorithm') in [DkuModelAccessorConstants.DKU_XGBOOST_CLASSIF, DkuModelAccessorConstants.DKU_XGBOOST_REGRESSION]: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would place this array in a new var tree_based_algo = [DkuModelAccessorConstants.DKU_XGBOOST_CLASSIF, DkuModelAccessorConstants.DKU_XGBOOST_REGRESSION]
for PEP8 and ease of addition
No description provided.