github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/inference/sklearn_japanese_housing_regression.py

github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/inference/sklearn_japanese_housing_regression.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """A pipeline that uses RunInference API on a regression about housing prices.
    19  
    20  This example uses the Japanese housing data from Kaggle.
    21  https://www.kaggle.com/datasets/nishiodens/japan-real-estate-transaction-prices
    22  
    23  Since the data has missing fields, this example illustrates how to split
    24  data and assign it to the models that are trained on different subsets of
    25  features. The predictions are then recombined.
    26  
    27  In order to set this example up, you will need two things.
    28  1. Build models (or use ours) and reference those via the model directory.
    29  2. Download the data from Kaggle and host it.
    30  """
    31  
    32  import argparse
    33  import os
    34  from typing import Iterable
    35  
    36  import pandas
    37  
    38  import apache_beam as beam
    39  from apache_beam.io.filesystems import FileSystems
    40  from apache_beam.ml.inference.base import RunInference
    41  from apache_beam.ml.inference.sklearn_inference import ModelFileType
    42  from apache_beam.ml.inference.sklearn_inference import SklearnModelHandlerPandas
    43  from apache_beam.options.pipeline_options import PipelineOptions
    44  from apache_beam.options.pipeline_options import SetupOptions
    45  from apache_beam.runners.runner import PipelineResult
    46  
    47  # yapf: disable
    48  MODELS = [
    49    {
    50      'name': 'all_features',
    51        'required_features': [
    52          'Area',
    53          'Year',
    54          'MinTimeToNearestStation',
    55          'MaxTimeToNearestStation',
    56          'TotalFloorArea',
    57          'Frontage',
    58          'Breadth',
    59          'BuildingYear']
    60    }, {
    61    'name': 'floor_area',
    62    'required_features': ['Area', 'Year', 'TotalFloorArea']
    63    }, {
    64      'name': 'stations',
    65      'required_features': [
    66          'Area',
    67          'Year',
    68          'MinTimeToNearestStation',
    69          'MaxTimeToNearestStation']
    70    }, {
    71      'name': 'no_features',
    72      'required_features': ['Area', 'Year']
    73    }
    74  ]
    75  # yapf: enable
    76  
    77  
    78  def sort_by_features(dataframe, max_size):
    79    """ Returns an index to a model, based on available data."""
    80    for i, model in enumerate(MODELS):
    81      required_features = dataframe[model['required_features']]
    82      # A model can only make a prediction if all required features
    83      # are present.
    84      # required_features is 2D single row, so all() must be called twice.
    85      if required_features.notnull().all().all():
    86        return i
    87    return -1
    88  
    89  
    90  class LoadDataframe(beam.DoFn):
    91    def process(self, file_name: str) -> Iterable[pandas.DataFrame]:
    92      """ Loads data files as a pandas dataframe."""
    93      file = FileSystems.open(file_name, 'rb')
    94      dataframe = pandas.read_csv(file)
    95      for i in range(dataframe.shape[0]):
    96        yield dataframe.iloc[[i]]
    97  
    98  
    99  def report_predictions(prediction_result):
   100    true_result = prediction_result.example['TradePrice'].values[0]
   101    inference = prediction_result.inference
   102    return 'True Price %.0f, Predicted Price %.0f' % (true_result, inference)
   103  
   104  
   105  def parse_known_args(argv):
   106    """Parses args for the workflow."""
   107    parser = argparse.ArgumentParser()
   108    parser.add_argument(
   109        '--input',
   110        dest='input',
   111        required=True,
   112        help='A single or comma-separated list of files or uris.')
   113    parser.add_argument(
   114        '--model_path',
   115        dest='model_path',
   116        required=True,
   117        help='A path from where all models can be read.')
   118    parser.add_argument(
   119        '--output',
   120        dest='output',
   121        required=True,
   122        help='Path to save output predictions.')
   123    return parser.parse_known_args(argv)
   124  
   125  
   126  def inference_transform(model_name, model_path):
   127    """Returns a RunInference transform."""
   128    model_filename = model_path + model_name + '.pickle'
   129    model_loader = SklearnModelHandlerPandas(
   130        model_file_type=ModelFileType.PICKLE, model_uri=model_filename)
   131    transform_name = 'RunInference ' + model_name
   132    return transform_name >> RunInference(model_loader)
   133  
   134  
   135  def run(
   136      argv=None, save_main_session=True, test_pipeline=None) -> PipelineResult:
   137    """Entry point. Defines and runs the pipeline."""
   138    known_args, pipeline_args = parse_known_args(argv)
   139    pipeline_options = PipelineOptions(pipeline_args)
   140    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
   141    requirements_dir = os.path.dirname(os.path.realpath(__file__))
   142    # Pin to the version that we trained the model on.
   143    # Sklearn doesn't guarantee compatability between versions.
   144    pipeline_options.view_as(
   145        SetupOptions
   146    ).requirements_file = f'{requirements_dir}/sklearn_examples_requirements.txt'
   147  
   148    pipeline = test_pipeline
   149    if not test_pipeline:
   150      pipeline = beam.Pipeline(options=pipeline_options)
   151  
   152    # Input may be a single file or a comma separated list of files.
   153    file_names = pipeline | 'FileNames' >> beam.Create(
   154        known_args.input.split(','))
   155    loaded_data = file_names | beam.ParDo(LoadDataframe())
   156  
   157    # Some examples don't have all features. Pipelines
   158    # that expect those fields will fail. There are many ways to deal with
   159    # missing data. This example illustrates how to assign predictions to
   160    # different models depending upon what data is available.
   161    [all, floor_area, stations, no_features] = (
   162        loaded_data
   163        | 'Partition' >> beam.Partition(sort_by_features, len(MODELS)))
   164  
   165    model_path = known_args.model_path
   166    prediction_1 = all | inference_transform('all_features', model_path)
   167    prediction_2 = floor_area | inference_transform('floor_area', model_path)
   168    prediction_3 = stations | inference_transform('stations', model_path)
   169    prediction_4 = no_features | inference_transform('no_features', model_path)
   170  
   171    all_predictions = (prediction_1, prediction_2, prediction_3, prediction_4)
   172    flattened_predictions = all_predictions | 'Flatten' >> beam.Flatten()
   173    prediction_report = (
   174        flattened_predictions
   175        | 'AllPredictions' >> beam.Map(report_predictions))
   176    _ = prediction_report | "WriteOutput" >> beam.io.WriteToText(
   177        known_args.output, append_trailing_newlines=True, shard_name_template='')
   178  
   179    result = pipeline.run()
   180    result.wait_until_finish()
   181    return result
   182  
   183  
# Run the example pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
  run()