# Source: github.com/pachyderm/pachyderm@v1.13.4/examples/spouts/EmailSentimentAnalyzer/sentimentalist.py
"""Sort email messages into positive and negative directories by sentiment.

Every file found under the input directory is parsed as an email message,
scored with VADER on its subject plus plain-text body, tagged with a header
holding the full score, and written to either the positives or the negatives
directory.  Defaults for all locations and for the header name come from
environment variables and can be overridden on the command line.
"""
import argparse
import email
import mimetypes
import os
from email.policy import default

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Defaults, overridable through the environment and then the command line.
default_input_repo = os.getenv('INPUT_REPO', '/pfs/imap_spout')
default_negatives_dir = os.getenv('NEGATIVES_DIRECTORY', '/pfs/out/negatives')
default_positives_dir = os.getenv('POSITIVES_DIRECTORY', '/pfs/out/positives')
default_sentiment_header = os.getenv('SENTIMENT_HEADER', 'X-Sentiment-Rating')


def gauge_email_sentiment(file, input_dir, positives, negatives, header, analyzer):
    """Score one email file and write it, annotated, to the matching directory.

    Args:
        file: Name of the email file inside ``input_dir``; the output file
            reuses the same name.
        input_dir: Directory containing ``file``.
        positives: Directory that receives messages whose compound score
            is zero or greater.
        negatives: Directory that receives messages whose compound score
            is below zero.
        header: Name of the header added to the message; its value is
            ``str()`` of the full score mapping.
        analyzer: Object exposing ``polarity_scores(text)`` that returns a
            mapping with a ``'compound'`` entry (a VADER
            ``SentimentIntensityAnalyzer`` in ``main``).

    A message without a text/plain part is scored on its subject alone, and
    a missing subject is left out of the scored text instead of being
    rendered as the word "None".
    """
    with open(os.path.join(input_dir, file), 'rb') as email_fp:
        msg = email.message_from_binary_file(email_fp, policy=default)

    # preferencelist must be a sequence of subtypes; ('plain') is merely the
    # string 'plain' and only worked through substring matching.
    msg_body = msg.get_body(preferencelist=('plain',))

    # We include the subject in scoring the message.
    text_parts = []
    subject = msg['subject']
    if subject is not None:
        text_parts.append(str(subject))
    if msg_body is not None:
        # get_body() returns None when there is no plain-text candidate
        # (for example an HTML-only message).
        text_parts.append(msg_body.get_content())
    score = analyzer.polarity_scores(' '.join(text_parts))

    # Put the scores in the envelope for later use.
    msg.add_header(header, str(score))

    # Decide where to put the message.
    output_path = negatives if score['compound'] < 0 else positives

    # Write the message out, using the same filename.
    with open(os.path.join(output_path, file), 'wb') as out_fp:
        out_fp.write(msg.as_bytes())


def main():
    """Parse the command line and classify every email under the input repo."""
    parser = argparse.ArgumentParser(
        description='Unpack each of the email messages found in a directory '
                    '(default /pfs/imap_spout), grab the subject and plain '
                    'text, rate it for sentiments, add a header (default '
                    'X-Sentiment-Rating) with the rating, and sort into one '
                    'of two directories based on positive (default '
                    '/pfs/out/positives) or negative sentiment (default '
                    '/pfs/out/negatives) ratings.')

    parser.add_argument(
        '-i', '--input_repo', required=False,
        help='The directory where the emails to be processed \n'
             'are to be found, one email per file. This overrides the '
             'default and the environment variable INPUT_REPO.',
        default=default_input_repo)
    parser.add_argument(
        '-n', '--negatives_dir', required=False,
        help='Where the negative emails go. This overrides the default and '
             'the environment variable NEGATIVES_DIRECTORY.',
        default=default_negatives_dir)
    parser.add_argument(
        '-p', '--positives_dir', required=False,
        help='Where the positive emails go. This overrides the default and '
             'the environment variable POSITIVES_DIRECTORY.',
        default=default_positives_dir)
    parser.add_argument(
        '-s', '--sentiment_header', required=False,
        help='The header that gets the full sentiment rating on the output '
             'email. This overrides the default and the environment '
             'variable SENTIMENT_HEADER.',
        default=default_sentiment_header)
    args = parser.parse_args()
    analyzer = SentimentIntensityAnalyzer()

    # Create both output directories, tolerating ones that already exist.
    os.makedirs(args.negatives_dir, exist_ok=True)
    os.makedirs(args.positives_dir, exist_ok=True)

    # NOTE: the walk is recursive while output names are bare filenames, so
    # identically named files from different subdirectories overwrite each
    # other in the output directories.
    for dirpath, _dirs, files in os.walk(args.input_repo):
        for file in files:
            gauge_email_sentiment(file, dirpath, args.positives_dir,
                                  args.negatives_dir, args.sentiment_header,
                                  analyzer)


if __name__ == "__main__":
    main()