github.com/pachyderm/pachyderm@v1.13.4/examples/spouts/EmailSentimentAnalyzer/sentimentalist.py

github.com/pachyderm/pachyderm@v1.13.4/examples/spouts/EmailSentimentAnalyzer/sentimentalist.py (about)

     1  from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
     2  import os
     3  import email
     4  import mimetypes
     5  from email.policy import default
     6  import argparse
     7  
     8  default_input_repo = os.getenv('INPUT_REPO', '/pfs/imap_spout')
     9  default_negatives_dir = os.getenv('NEGATIVES_DIRECTORY', '/pfs/out/negatives')
    10  default_positives_dir = os.getenv('POSITIVES_DIRECTORY', '/pfs/out/positives')
    11  default_sentiment_header = os.getenv('SENTIMENT_HEADER', 'X-Sentiment-Rating')
    12  
    13  def gauge_email_sentiment(file, input_dir, positives, negatives, header, analyzer):
    14      with open(os.path.join(input_dir,file), 'rb') as email_fp:
    15          msg = email.message_from_binary_file(email_fp, policy=default)
    16      msg_body = msg.get_body(preferencelist=('plain'))
    17      # We include the subject in scoring the message
    18      score = analyzer.polarity_scores("{} {}".format(msg['subject'], msg_body.get_content()))
    19      # This would score without the subject
    20      # score = analyzer.polarity_scores(msg_body.get_content())
    21      # Put the scores in the envelope for later use
    22      msg.add_header(header, str(score))
    23      # Decide where to put the message
    24      if score['compound'] < 0:
    25          output_path = negatives
    26      else:
    27          output_path = positives
    28      # Write the message out, using the same filename
    29      with open(os.path.join(output_path,file), "wb") as out_fp:
    30          out_fp.write(msg.as_bytes())
    31  
    32  def main():
    33          parser = argparse.ArgumentParser(description='Unpack each of the email messages found in a directory (default /pfs/imap_spout), grab the subject and plain text, rate it for sentiments, add a header (default X-Sentiment-Rating) with the rating, and sort into one of two directories based on positive (default /pfs/out/positive) or negative sentiment (default /pfs/out/negative) ratings.')
    34  
    35          parser.add_argument('-i', '--input_repo', required=False,
    36                              help="""The directory where the emails to be processed are to be found, one email per file. This overrides the default and the environment variable INPUT_REPO.""",
    37                              default=default_input_repo)
    38          parser.add_argument('-n', '--negatives_dir', required=False,
    39                              help="""Where the negative emails go. This overrides the default and the environment variable NEGATIVES_DIRECTORY.""", default=default_negatives_dir)
    40          parser.add_argument('-p', '--positives_dir', required=False,
    41                              help="""Where the positive emails go. This overrides the default and the environment variable POSITIVES_DIRECTORY.""", default=default_positives_dir)
    42          parser.add_argument('-s', '--sentiment_header', required=False,
    43                              help="""The header that gets the full sentiment rating on the output email. This overrides the default and the environment variable SENTIMENT_HEADER.""", default=default_sentiment_header)
    44          args = parser.parse_args()
    45          analyzer = SentimentIntensityAnalyzer()
    46  
    47          try:
    48              os.mkdir(args.negatives_dir)
    49          except FileExistsError:
    50              pass
    51      
    52          try:
    53              os.mkdir(args.positives_dir)
    54          except FileExistsError:
    55              pass
    56  
    57          for dirpath, dirs, files in os.walk(args.input_repo):
    58              for file in files:
    59                  gauge_email_sentiment(file, dirpath, args.positives_dir, args.negatives_dir, args.sentiment_header, analyzer)
    60  
    61  
    62  if __name__== "__main__":
    63    main()