github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/inference/sklearn_japanese_housing_regression.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A pipeline that uses the RunInference API for regression on housing prices.

This example uses the Japanese housing data from Kaggle:
https://www.kaggle.com/datasets/nishiodens/japan-real-estate-transaction-prices

Since the data has missing fields, this example illustrates how to split
the data and assign it to models that are trained on different subsets of
features. The predictions are then recombined.

To set this example up, you will need two things:
1. Build models (or use ours) and reference them via the model directory.
2. Download the data from Kaggle and host it.
"""

import argparse
import os
from typing import Iterable

import pandas

import apache_beam as beam
from apache_beam.io.filesystems import FileSystems
from apache_beam.ml.inference.base import RunInference
from apache_beam.ml.inference.sklearn_inference import ModelFileType
from apache_beam.ml.inference.sklearn_inference import SklearnModelHandlerPandas
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.runners.runner import PipelineResult

# Models are ordered from most to least specific; a row is assigned to the
# first model whose required features are all present.
# yapf: disable
MODELS = [
    {
        'name': 'all_features',
        'required_features': [
            'Area',
            'Year',
            'MinTimeToNearestStation',
            'MaxTimeToNearestStation',
            'TotalFloorArea',
            'Frontage',
            'Breadth',
            'BuildingYear']
    }, {
        'name': 'floor_area',
        'required_features': ['Area', 'Year', 'TotalFloorArea']
    }, {
        'name': 'stations',
        'required_features': [
            'Area',
            'Year',
            'MinTimeToNearestStation',
            'MaxTimeToNearestStation']
    }, {
        'name': 'no_features',
        'required_features': ['Area', 'Year']
    }
]
# yapf: enable
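

# The pickled models referenced by MODELS are not built by this pipeline.
# The sketch below shows one way they could be produced, assuming the Kaggle
# CSV has been downloaded locally. The paths, the LinearRegression estimator,
# and the helper name are illustrative assumptions, not part of the original
# example. Each model is wrapped in a ColumnTransformer so that it can select
# its own feature columns from the full row that the pandas model handler
# passes to predict().
def _train_models_sketch(
    csv_path='./japan_housing.csv', model_dir='./models/'):
  import pickle

  from sklearn.compose import ColumnTransformer
  from sklearn.linear_model import LinearRegression
  from sklearn.pipeline import Pipeline

  data = pandas.read_csv(csv_path)
  for model in MODELS:
    features = model['required_features']
    # Train each model only on rows where its required features and the
    # label are all present.
    subset = data.dropna(subset=features + ['TradePrice'])
    estimator = Pipeline([
        ('select', ColumnTransformer([('keep', 'passthrough', features)])),
        ('regress', LinearRegression()),
    ])
    estimator.fit(subset, subset['TradePrice'])
    with open(model_dir + model['name'] + '.pickle', 'wb') as file:
      pickle.dump(estimator, file)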


def sort_by_features(dataframe, num_partitions):
  """Returns the index of the first model whose required features are all
  present in the row, or -1 if no model applies."""
  for i, model in enumerate(MODELS):
    required_features = dataframe[model['required_features']]
    # A model can only make a prediction if all of its required features
    # are present. required_features is a single-row 2D frame, so all()
    # must be called twice, once per axis, to reduce to a single boolean.
    if required_features.notnull().all().all():
      return i
  # beam.Partition requires an index in [0, num_partitions), so rows that
  # no model can handle (missing even 'Area' or 'Year') fail the pipeline.
  return -1
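

# Illustration (not executed): a row with station times but without
# floor-area details fails the 'all_features' and 'floor_area' checks and
# is routed to the 'stations' model at index 2. The values are made up:
#
#   row = pandas.DataFrame([{
#       'Area': 100.0, 'Year': 2015,
#       'MinTimeToNearestStation': 5.0, 'MaxTimeToNearestStation': 10.0,
#       'TotalFloorArea': None, 'Frontage': None, 'Breadth': None,
#       'BuildingYear': None, 'TradePrice': 35000000.0}])
#   assert sort_by_features(row, len(MODELS)) == 2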


class LoadDataframe(beam.DoFn):
  def process(self, file_name: str) -> Iterable[pandas.DataFrame]:
    """Loads a data file and yields it row by row as pandas dataframes."""
    file = FileSystems.open(file_name, 'rb')
    dataframe = pandas.read_csv(file)
    # Emit each row as its own single-row dataframe.
    for i in range(dataframe.shape[0]):
      yield dataframe.iloc[[i]]


def report_predictions(prediction_result):
  true_result = prediction_result.example['TradePrice'].values[0]
  inference = prediction_result.inference
  return 'True Price %.0f, Predicted Price %.0f' % (true_result, inference)


def parse_known_args(argv):
  """Parses args for the workflow."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      required=True,
      help='A single file or a comma-separated list of files or URIs.')
  parser.add_argument(
      '--model_path',
      dest='model_path',
      required=True,
      help='A path from which all models can be read.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Path to save output predictions.')
  return parser.parse_known_args(argv)


def inference_transform(model_name, model_path):
  """Returns a RunInference transform for the named model."""
  # model_path is expected to end with a path separator.
  model_filename = model_path + model_name + '.pickle'
  model_loader = SklearnModelHandlerPandas(
      model_file_type=ModelFileType.PICKLE, model_uri=model_filename)
  transform_name = 'RunInference ' + model_name
  return transform_name >> RunInference(model_loader)


def run(
    argv=None, save_main_session=True, test_pipeline=None) -> PipelineResult:
  """Entry point. Defines and runs the pipeline."""
  known_args, pipeline_args = parse_known_args(argv)
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
  requirements_dir = os.path.dirname(os.path.realpath(__file__))
  # Pin to the version that the models were trained on. Sklearn doesn't
  # guarantee compatibility between versions.
  pipeline_options.view_as(
      SetupOptions
  ).requirements_file = f'{requirements_dir}/sklearn_examples_requirements.txt'

  pipeline = test_pipeline
  if not test_pipeline:
    pipeline = beam.Pipeline(options=pipeline_options)

  # Input may be a single file or a comma-separated list of files.
  file_names = pipeline | 'FileNames' >> beam.Create(
      known_args.input.split(','))
  loaded_data = file_names | beam.ParDo(LoadDataframe())

  # Some examples don't have all features, and models that expect those
  # fields would fail on them. There are many ways to deal with missing
  # data; this example illustrates how to assign examples to different
  # models depending upon what data is available.
  [all_features, floor_area, stations, no_features] = (
      loaded_data
      | 'Partition' >> beam.Partition(sort_by_features, len(MODELS)))

  model_path = known_args.model_path
  prediction_1 = all_features | inference_transform('all_features', model_path)
  prediction_2 = floor_area | inference_transform('floor_area', model_path)
  prediction_3 = stations | inference_transform('stations', model_path)
  prediction_4 = no_features | inference_transform('no_features', model_path)

  all_predictions = (prediction_1, prediction_2, prediction_3, prediction_4)
  flattened_predictions = all_predictions | 'Flatten' >> beam.Flatten()
  prediction_report = (
      flattened_predictions
      | 'AllPredictions' >> beam.Map(report_predictions))
  _ = prediction_report | 'WriteOutput' >> beam.io.WriteToText(
      known_args.output, append_trailing_newlines=True, shard_name_template='')

  result = pipeline.run()
  result.wait_until_finish()
  return result


if __name__ == '__main__':
  run()
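
# Example invocation (illustrative paths). Note that model_path must end
# with a path separator, since inference_transform builds each model URI by
# string concatenation:
#
#   python sklearn_japanese_housing_regression.py \
#       --input gs://<bucket>/japan_housing.csv \
#       --model_path gs://<bucket>/models/ \
#       --output gs://<bucket>/predictions.txt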