%pip install mlrun scikit-learn pandas numpy
from os import path
import mlrun
# Initialize the MLRun project environment. user_project=True suffixes the
# project name with the current username so concurrent users don't collide;
# set_environment returns the resolved project name and the default artifact
# path used by all runs below.
project_name_base = 'pmt-app'
project_name, artifact_path = mlrun.set_environment(project=project_name_base, user_project=True)
print(f'Project name: {project_name}')
print(f'Artifact path: {artifact_path}')
from os import path
import numpy as np
import pandas as pd
import datetime as dt
from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from pickle import dumps
from sklearn.ensemble import RandomForestClassifier
def clean_yes_no_column(serie, train=True, train_mean=None):
    """Convert a mixed "yes"/"no"/numeric pandas Series to floats.

    Mapping: "no" -> 0.0, numeric-like values -> float, and "yes" -> the
    mean of the non-"yes" values (computed from this series when
    ``train=True``, or taken from ``train_mean`` when ``train=False`` so
    inference reuses the training-time statistic).

    Args:
        serie: pandas Series containing "yes", "no", and numeric-like values.
        train: when True, compute the "yes" replacement mean from this series.
        train_mean: replacement value for "yes" when ``train=False``.

    Returns:
        A float pandas Series with every "yes"/"no" token resolved.

    Raises:
        ValueError: if ``train`` is False and ``train_mean`` was not given
            (the original code silently produced NaN in that case).
    """
    if not train and train_mean is None:
        raise ValueError("train_mean is required when train=False")
    # First normalize: "no" -> 0, anything numeric-like -> float;
    # "yes" is kept as a string sentinel until its replacement is known.
    cleaned = serie.apply(lambda v: 0 if v == "no" else v)
    cleaned = cleaned.apply(lambda v: float(v) if v != "yes" else v)
    # Replacement for "yes": mean of the already-numeric entries (train)
    # or the caller-supplied training-time mean (inference).
    mean_value = cleaned[cleaned != "yes"].mean() if train else train_mean
    return cleaned.apply(lambda v: mean_value if v == "yes" else v)
def fetch_data(context: MLClientCtx, pmt_records_path: DataItem):
    """Load the raw PMT records and log them as a CSV dataset artifact.

    Args:
        context: MLRun execution context (provides logger and artifact path).
        pmt_records_path: DataItem pointing at the raw records (CSV/URL).
    """
    df = pmt_records_path.as_df()
    # Artifacts produced by this step live under <artifact_path>/data.
    destination = path.join(context.artifact_path, 'data')
    context.logger.info('Saving datasets to {} ...'.format(destination))
    # Store the data set in the project's artifact database.
    context.log_dataset(
        'pmt-app-dataset',
        df=df,
        format='csv',
        index=False,
        artifact_path=destination,
    )
def transform_dataset(context: MLClientCtx, pmt_records_path: DataItem):
    """Impute missing values, normalize mixed yes/no columns, log the result.

    Reads the dataset logged by fetch_data, fills NaNs with column medians
    (constant 0 for v18q1), recomputes SQBmeaned, converts the mixed
    "yes"/"no"/numeric columns to floats via clean_yes_no_column, and logs
    the transformed frame as the 'pmt-app-dataset-transformed' artifact.

    Args:
        context: MLRun execution context.
        pmt_records_path: DataItem for the raw dataset artifact.
    """
    context.logger.info('Begin datasets transform')
    train_data = pmt_records_path.as_df()
    # Median (or constant) imputation for the columns known to contain NaNs.
    fill_dict = {
        "v2a1": train_data.v2a1.median(),        # monthly rent payment
        "v18q1": 0,                              # number of tablets household owns
        "rez_esc": train_data.rez_esc.median(),  # years behind in school
        "meaneduc": train_data.meaneduc.median(),  # avg years of education, adults 18+
    }
    train_data = train_data.fillna(fill_dict)
    # Use item assignment: attribute assignment (df.col = ...) silently sets a
    # plain attribute instead of a column when the name doesn't already exist,
    # emitting only a UserWarning; bracket assignment always sets the column.
    # NOTE(review): the column name suggests meaneduc**2 ("SQB" = squared),
    # but the original code used np.sqrt — behavior preserved here; confirm
    # against the dataset's feature definition.
    train_data["SQBmeaned"] = np.sqrt(train_data.meaneduc)
    train_data["dependency"] = clean_yes_no_column(train_data.dependency)
    train_data["edjefe"] = clean_yes_no_column(train_data.edjefe)
    train_data["edjefa"] = clean_yes_no_column(train_data.edjefa)
    # Save the transformed dataset to the artifact store.
    target_path = path.join(context.artifact_path, 'data')
    context.log_dataset('pmt-app-dataset-transformed', df=train_data,
                        artifact_path=target_path, format='csv')
    context.logger.info('End dataset transform')
def train_model(context: MLClientCtx, input_ds: DataItem):
    """Fit a RandomForestClassifier on the transformed dataset and log it.

    Args:
        context: MLRun execution context.
        input_ds: DataItem for the transformed training dataset.
    """
    context.logger.info('Begin training')
    train_data = input_ds.as_df()
    # Everything except the label and the identifier columns is a feature.
    excluded = {"Target", "Id", "idhogar"}
    feature_cols = [col for col in train_data.columns if col not in excluded]
    features = train_data[feature_cols]
    labels = train_data.Target
    classifier = RandomForestClassifier()
    classifier.fit(features, labels)
    # Persist the pickled model as an artifact under models/.
    context.log_model(
        'ModelPMT',
        body=dumps(classifier),
        artifact_path=context.artifact_subpath("models"),
        model_file="ModelPMT.pkl",
    )
    context.logger.info('End training')
# mlrun: end-code
# Remote CSV with the raw training records used by the pipeline below.
pmt_records_csv_path = 'https://pmt-data.herokuapp.com/train.csv'
# Package the handlers defined above into a single reusable MLRun job
# function; the image and requirements define the remote runtime.
model_pmt_func = mlrun.code_to_function(name='model_pmt',
kind='job',
image='mlrun/mlrun',
requirements=['scikit-learn', 'numpy','pandas'])
# Bare expression (notebook cell): only echoed the function object in the
# original notebook; it has no effect when run as a script.
fetch_data
# Run locally: we can test our code locally by calling the function with the
# `local` parameter set to True.
# Run the fetch_data handler locally (in-process) to validate the code
# before deploying it to the cluster.
fetch_data_run = model_pmt_func.run(handler='fetch_data',
inputs={'pmt_records_path': pmt_records_csv_path},
local=True)
# Bare expression (notebook cell): echoed the run's output artifacts.
fetch_data_run.outputs
from mlrun.platforms import auto_mount
# Mount shared storage into the function's pods and build/deploy the
# function image so it can run remotely.
model_pmt_func.apply(auto_mount())
model_pmt_func.deploy()
# Step 1 (remote): fetch the raw dataset and log it as an artifact.
fetch_data_run = model_pmt_func.run(name='fetch_data',
handler='fetch_data',
inputs={'pmt_records_path': pmt_records_csv_path})
# Notebook echo: URI of the logged raw-dataset artifact.
fetch_data_run.outputs['pmt-app-dataset']
# Step 2 (remote): transform the dataset logged by step 1.
transform_dataset_run = model_pmt_func.run(name='transform_dataset',
handler='transform_dataset',
inputs={'pmt_records_path': fetch_data_run.outputs['pmt-app-dataset']})
# Notebook echo: output artifacts of the transform step.
transform_dataset_run.outputs
# Step 3 (remote): train the model on the transformed dataset.
train_model_run = model_pmt_func.run(name='train_model',
handler='train_model',
inputs={'input_ds': transform_dataset_run.outputs['pmt-app-dataset-transformed']})
# Notebook echo: URI of the logged model artifact.
train_model_run.outputs['ModelPMT']
from mlrun import import_function
from mlrun.platforms import auto_mount
# Deploy the generic v2 model server from the MLRun function hub and attach
# the trained model artifact to it under the name 'PMTModel'.
serve = import_function('hub://v2_model_server').apply(auto_mount())
model_name='PMTModel'
serve.add_model(model_name, model_path=train_model_run.outputs['ModelPMT'])
addr = serve.deploy()
# Test the model
import json
# One sample row of feature values matching the training feature columns
# (Target/Id/idhogar excluded) — assumed to follow the training column
# order; verify against the transformed dataset's schema.
inputs = [[190000,0,3,0,1,1,0,0,0,1,1,0,0,0,0,1,1,1,1,10,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,10,0,10,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,1,1,1,0,0,0,0,0,1,0,43,100,1849,1,100,0,1,0,100,1849]]
my_data = json.dumps({'inputs': inputs})
# Invoke the v2 inference endpoint with the JSON payload.
serve.invoke(f'v2/models/PMTModel/infer', my_data)