github.com/pachyderm/pachyderm@v1.13.4/examples/ml/neon/inference/auto_inference.py

#!/usr/bin/env python
"""
Example that runs inference with an LSTM network for Amazon review analysis

$ python examples/imdb/auto_inference.py --model_weights imdb.p --vocab_file imdb.vocab \
    --review_files /pfs/reviews --output_dir /pfs/out

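The /pfs paths assume the script is running inside a Pachyderm pipeline, where
input repos are mounted under /pfs/<repo> and anything written to /pfs/out is
committed to the pipeline's output repo.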
     8  """
     9  
    10  from __future__ import print_function
    11  from future import standard_library
    12  standard_library.install_aliases()  # triggers E402, hence noqa below
    13  from builtins import input  # noqa
    14  import numpy as np  # noqa
    15  from neon.backends import gen_backend  # noqa
    16  from neon.initializers import Uniform, GlorotUniform  # noqa
    17  from neon.layers import LSTM, Affine, Dropout, LookupTable, RecurrentSum  # noqa
    18  from neon.models import Model  # noqa
    19  from neon.transforms import Logistic, Tanh, Softmax  # noqa
    20  from neon.util.argparser import NeonArgparser, extract_valid_args  # noqa
    21  from neon.util.compat import pickle  # noqa
    22  from neon.data.text_preprocessing import clean_string  # noqa
    23  import os
    24  
# parse the command line arguments
parser = NeonArgparser(__doc__)
parser.add_argument('--model_weights', required=True,
                    help='pickle file of trained weights')
parser.add_argument('--vocab_file', required=True,
                    help='vocabulary file')
parser.add_argument('--review_files', required=True,
                    help='directory containing reviews in text files')
parser.add_argument('--output_dir', required=True,
                    help='directory where results will be saved')
args = parser.parse_args()
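# NeonArgparser also adds neon's stock options (backend, device id, epochs, ...);
# extract_valid_args() below forwards the backend-related ones to gen_backend()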


# hyperparameters from the reference
batch_size = 1
clip_gradients = True
gradient_limit = 5
vocab_size = 20000
sentence_length = 128
embedding_dim = 128
hidden_size = 128
reset_cells = True
num_epochs = args.epochs
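# clip_gradients, gradient_limit and num_epochs are training-time settings kept
# here for reference; they are not used on the inference path below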

# setup backend
be = gen_backend(**extract_valid_args(args, gen_backend))
be.bsz = 1  # score one review per forward pass


# define the same model topology as used in training
init_glorot = GlorotUniform()
init_emb = Uniform(low=-0.1 / embedding_dim, high=0.1 / embedding_dim)
nclass = 2
layers = [
    LookupTable(vocab_size=vocab_size, embedding_dim=embedding_dim, init=init_emb,
                pad_idx=0, update=True),
    LSTM(hidden_size, init_glorot, activation=Tanh(),
         gate_activation=Logistic(), reset_cells=True),
    RecurrentSum(),
    Dropout(keep=0.5),
    Affine(nclass, init_glorot, bias=init_glorot, activation=Softmax())
]
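# this layer stack has to match the topology the weights were trained with,
# otherwise load_params() below cannot map the saved parameters onto the model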


# load the weights
print("Initializing the model")
model_new = Model(layers=layers)
print("Loading the weights from {0}".format(args.model_weights))

model_new.load_params(args.model_weights)
model_new.initialize(dataset=(sentence_length, batch_size))

# setup buffers before accepting reviews
xdev = be.zeros((sentence_length, 1), dtype=np.int32)  # backend tensor: (sentence_length, batch of 1)
xbuf = np.zeros((1, sentence_length), dtype=np.int32)  # host staging buffer
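# special token indices, mirroring the IMDB-style preprocessing used at training
# time: 0 pads, 1 marks the start of a review, 2 stands in for out-of-vocabulary
# words, and real word ids are shifted up by index_from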
oov = 2
start = 1
index_from = 3
pad_char = 0
with open(args.vocab_file, 'rb') as vocab_fh:
    vocab, rev_vocab = pickle.load(vocab_fh)

# walk over the reviews in the text files, making inferences
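# for each review: tokenize, map tokens to vocabulary indices (applying the
# start/oov/index_from conventions above), left-pad or truncate to
# sentence_length, run a forward pass, and write the class probabilities to a
# same-named file in --output_dir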
for dirpath, dirs, files in os.walk(args.review_files):
    for file in files:
        with open(os.path.join(dirpath, file), 'r') as myfile:
            data = myfile.read()

        # clean the input
        tokens = clean_string(data).strip().split()

        # check for oov and add start
        sent = [len(vocab) + 1 if t not in vocab else vocab[t] for t in tokens]
        sent = [start] + [w + index_from for w in sent]
        sent = [oov if w >= vocab_size else w for w in sent]

        # pad sentences
        xbuf[:] = 0
        trunc = sent[-sentence_length:]
        xbuf[0, -len(trunc):] = trunc
        xdev[:] = xbuf.T.copy()
        y_pred = model_new.fprop(xdev, inference=True)  # inference=True disables dropout

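        # y_pred is a backend tensor of shape (nclass, batch); y_pred.get().T
        # below is the 1x2 row of class probabilities for this review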
        with open(os.path.join(args.output_dir, file), "w") as output_file:
            output_file.write("Pred - {0}\n".format(y_pred.get().T))