github.com/wlattner/mlserver@v0.0.0-20141113171038-895f261d2bfd/fit.py

github.com/wlattner/mlserver@v0.0.0-20141113171038-895f261d2bfd/fit.py (about)

     1  import os
     2  import json
     3  import datetime
     4  
     5  from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
     6  from sklearn.linear_model import LogisticRegression
     7  from sklearn.feature_extraction import DictVectorizer
     8  from sklearn.pipeline import Pipeline
     9  from sklearn.externals import joblib
    10  from sklearn.cross_validation import cross_val_score
    11  from sklearn.metrics import confusion_matrix
    12  
    13  def fit(X, Y):
    14    models = {
    15      'LogisticRegression': LogisticRegression(),
    16      'GradientBoostingClassifier': GradientBoostingClassifier(n_estimators=150),
    17      'RandomForestClassifier': RandomForestClassifier(n_estimators=150)
    18    }
    19  
    20    best_score = 0
    21    best_model = ''
    22    for model in models:
    23      vec = DictVectorizer(sparse=False)
    24      clf = models[model]
    25      pl = Pipeline([('vec', vec), ('clf', clf)])
    26  
    27      #TODO: grid search for model params
    28      scores = cross_val_score(pl, X, Y, n_jobs=3)
    29      if scores.mean() > best_score:
    30        best_score = scores.mean()
    31        best_model = model
    32  
    33    # retrain best model with all data
    34    vec = DictVectorizer(sparse=False)
    35    clf = models[best_model]
    36    pl = Pipeline([('vec', vec), ('clf', clf)])
    37    pl.fit(X, Y)
    38    pl.score_ = best_score  # report cv score
    39    return pl
    40  
    41  def save(path, model_id, model):
    42  	fname = model_id + '.pkl'
    43  	if not os.path.exists(path):
    44  		os.makedirs(path)
    45  	joblib.dump(model, os.path.join(path, fname))
    46  
    47  def save_metadata(path, model_id, model_name, model, X, Y):
    48  	Y_hat = model.predict(X)
    49  
    50  	labels = [l for l in model.named_steps['clf'].classes_]
    51  
    52  	cm = confusion_matrix(Y, Y_hat, labels=labels)
    53  	# this is an insane dict comprehension, need to encode the val as a float, json will not encode 0
    54  	cm_dict = {str(labels[inx]): {str(labels[c]):float(val) for c, val in enumerate(row)} for inx, row in enumerate(cm)}
    55  
    56  	model_data = {
    57  		"model_id": model_id,
    58  		"metadata": {
    59  			"name": model_name,
    60  			"created_at": datetime.datetime.utcnow().isoformat('T') + 'Z'
    61  		},
    62  		"performance" : {
    63  			"algorithm": model.named_steps['clf'].__class__.__name__,
    64  			"score": model.score_,
    65  			"confusion_matrix": cm_dict
    66  		}
    67  	}
    68  
    69  	json.dump(model_data, open(os.path.join(path, model_id + '.json'), 'w'))
    70  
    71  
    72  if __name__ == "__main__":
    73  	import sys
    74  
    75  	data = json.load(open(sys.argv[2]))
    76  	model_save_path = sys.argv[1]
    77  	model_id = os.path.basename(model_save_path)
    78  
    79  	model = fit(data['data'], data['labels'])
    80  	save(model_save_path, model_id, model)
    81  	save_metadata(model_save_path, model_id, data['name'], model, data['data'], data['labels'])