"""Structured-data regression example.

Loads one or more CSV datasets, saves exploratory plots (pairplot and
correlation heatmap), cross-validates and fits a Random Forest regressor,
plots its learning curve, and serializes the fitted model to disk.
"""
import argparse
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import ensemble
from sklearn.model_selection import ShuffleSplit, cross_val_score

from utils import plot_learning_curve

parser = argparse.ArgumentParser(description="Structured data regression")
parser.add_argument("--input",
                    type=str,
                    help="csv file with all examples")
parser.add_argument("--target-col",
                    type=str,
                    help="column with target values")
parser.add_argument("--output",
                    metavar="DIR",
                    default='./output',
                    help="output directory")


def load_data(input_csv, target_col):
    """Load a CSV dataset and split it into features and targets.

    Returns a tuple of (full DataFrame, feature DataFrame, target Series).
    """
    data = pd.read_csv(input_csv, header=0)
    targets = data[target_col]
    features = data.drop(target_col, axis=1)
    print("Dataset has {} data points with {} variables each.".format(*data.shape))
    return data, features, targets


def create_pairplot(data):
    """Draw a seaborn pairplot of every column pair onto the current figure."""
    plt.clf()
    sns.pairplot(data, height=2.5)
    plt.tight_layout()


def create_corr_matrix(data):
    """Draw a lower-triangle correlation heatmap of the dataset's columns."""
    plt.clf()
    sns.set()
    corr = data.corr()

    # Mask the redundant upper triangle of the symmetric matrix.
    # BUG FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin bool is the documented replacement.
    mask = np.triu(np.ones_like(corr, dtype=bool))

    # Custom diverging colormap centered on zero correlation.
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
                square=True, linewidths=.5, annot=True, cbar_kws={"shrink": .5})


def train_model(features, targets):
    """Cross-validate and fit a Random Forest regressor.

    Prints the mean 10-fold CV score (+/- two standard deviations), then
    fits the estimator on the full dataset and returns it.
    """
    reg = ensemble.RandomForestRegressor(random_state=1)
    scores = cross_val_score(reg, features, targets, cv=10)
    # BUG FIX: the spec was "{:2f}" (minimum field width 2, default 6
    # decimals); "{:.2f}" gives the clearly intended two decimal places.
    print("Score: {:.2f} (+/- {:.2f})".format(scores.mean(), scores.std() * 2))
    # BUG FIX: cross_val_score only fits internal clones, so the original
    # returned (and main() later joblib.dump'ed) an UNfitted model.
    reg.fit(features, targets)
    return reg


def create_learning_curve(estimator, features, targets):
    """Plot a learning curve for the estimator using shuffled CV splits."""
    plt.clf()
    title = "Learning Curves (Random Forest Regressor)"
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
    plot_learning_curve(estimator, title, features, targets,
                        ylim=(0.5, 1.01), cv=cv, n_jobs=4)


def main():
    """Run the full pipeline for every input CSV (file or directory)."""
    args = parser.parse_args()
    if os.path.isfile(args.input):
        input_files = [args.input]
    else:  # Directory: gather every CSV recursively.
        # BUG FIX: the original rebound input_files on each os.walk
        # iteration, so only the last visited directory's CSVs survived.
        input_files = []
        for dirpath, _dirs, files in os.walk(args.input):
            input_files.extend(os.path.join(dirpath, filename)
                               for filename in files
                               if filename.endswith('.csv'))
    print("Datasets: {}".format(input_files))
    os.makedirs(args.output, exist_ok=True)

    for filename in input_files:
        experiment_name = os.path.basename(os.path.splitext(filename)[0])

        # Data loading and exploration plots.
        data, features, targets = load_data(filename, args.target_col)
        create_pairplot(data)
        plt.savefig(os.path.join(args.output, experiment_name + '_pairplot.png'))
        create_corr_matrix(data)
        plt.savefig(os.path.join(args.output, experiment_name + '_corr_matrix.png'))

        # Fit model and plot its learning curve.
        reg = train_model(features, targets)
        create_learning_curve(reg, features, targets)
        plt.savefig(os.path.join(args.output, experiment_name + '_cv_reg_output.png'))

        # Persist the fitted model.
        joblib.dump(reg, os.path.join(args.output, experiment_name + '_model.sav'))


if __name__ == "__main__":
    main()