"""Fetch comments from a reddit post or subreddit and render a word cloud.

Reads API credentials from config.json, gathers comment text via praw,
optionally re-weights word frequencies against a semicolon-delimited CSV
wordlist (wordlists/<name>.csv), and shows/saves the resulting cloud.
"""
import praw
import json
import argparse
import re
import csv
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from stop_words import safe_get_stop_words
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image, ImageColor

# Strips (optionally bracket-prefixed) http/https URLs from comment bodies.
# Compiled once instead of per-comment; pattern kept byte-identical.
URL_PATTERN = re.compile(r'[\[\(]?https?:\/\/[0-9A-Za-z\/\?#\[\]\)@\.!$\&%\-+,;=]+')


def build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser; defaults are applied via argparse instead of post-hoc None checks."""
    parser = argparse.ArgumentParser(description="Fetches comments from a reddit post and makes a word cloud")
    parser.add_argument("--sub", action="store_true", help="Treats the ID as the name of a sub")
    parser.add_argument("id", type=str, help="The ID of the reddit post")
    parser.add_argument("-n", metavar="limit", type=int,
                        help="The amount of times \"More comments...\" is resolved. (Default: all)")
    parser.add_argument("-p", metavar="posts", type=int, default=25,
                        help="Number of posts to fetch (Only in sub mode) (Default: 25)")
    parser.add_argument("-l", nargs="+", help="The languages to add stopwords for")
    parser.add_argument("-o", metavar="out", type=str, help="Output file")
    parser.add_argument("-s", metavar="scale", type=int, default=1, help="The scale of the wordcloud")
    parser.add_argument("-m", metavar="mask", type=str, help="The mask that is applied to the wordcloud")
    parser.add_argument("-b", metavar="background", type=str, default="black",
                        help="Background color of the wordcloud")
    parser.add_argument("-c", metavar=("cw", "cc"), type=str, nargs=2, help="Width and color of contour")
    parser.add_argument("--color", action="store_true", help="Use mask as color mask")
    parser.add_argument("-N", metavar="max_words", type=int, default=200,
                        help="Maximum number of words in WordCloud (Default 200)")
    parser.add_argument("-v", "--verbose", action="store_true", help="Dump comments to comments.log")
    parser.add_argument("-w", metavar="wordlist", type=str, default="english",
                        help="The wordlist to use for weighting (Default english)")
    parser.add_argument("-min", metavar="min_freq", type=float, default=0,
                        help="The minimum frequency a word needs to have to be counted in percent (Default 0)")
    parser.add_argument("-boost", metavar="freq_boost", type=float, default=1,
                        help="The boost a word that isn't in the wordlist gets (Default 1)")
    parser.add_argument("-blow", metavar="freq_blow", type=float, default=1,
                        help="The \"anti-boost\" a word that is in the wordlist gets (Default 1)")
    parser.add_argument("--top", action="store_true", help="Use Top posts instead of Hot posts")
    return parser


def fetch_comments(comment) -> list:
    """Recursively collect cleaned bodies of *comment* and all of its replies.

    URLs and the "[deleted]"/"[removed]" placeholders are stripped from
    every body before it is returned.
    """
    comment_body = URL_PATTERN.sub('', comment.body)
    comment_body = comment_body.replace("[deleted]", "").replace("[removed]", "")
    raw_comments = [comment_body]
    for reply in comment.replies:
        raw_comments.extend(fetch_comments(reply))
    return raw_comments


def _gather_comments(reddit, args) -> list:
    """Resolve the post(s) selected by *args* and return a flat list of comment bodies."""
    if args.sub:
        if args.top:
            # Fix: --top previously ignored -p and fetched an unlimited listing;
            # now honors the post limit like hot mode does.
            posts = reddit.subreddit(args.id).top("all", limit=args.p)
        else:
            posts = reddit.subreddit(args.id).hot(limit=args.p)
    else:
        posts = [reddit.submission(id=args.id)]
    posts = list(posts)
    comments = []
    for i, post in enumerate(posts, start=1):
        print(f"\rFetching comments... {i}/{len(posts)} ", end=" ", flush=True)
        # limit=None (the -n default) resolves every "More comments..." stub.
        post.comments.replace_more(limit=args.n)
        for top_level_comment in post.comments:
            comments.extend(fetch_comments(top_level_comment))
    print(f"Done!\nProcessed {len(comments)} comments")
    return comments


def _load_wordlist(name: str) -> dict:
    """Load wordlists/<name>.csv (semicolon-delimited) as {word: frequency}.

    Column 1 is the word, column 2 its corpus frequency (spaces used as
    thousands separators are removed before int conversion).
    """
    lookup_dict = {}
    # Fix: the file handle was previously bound to the name `list`,
    # shadowing the builtin used elsewhere in this script.
    with open(f"wordlists/{name}.csv") as wordlist_file:
        for row in csv.reader(wordlist_file, delimiter=";"):
            lookup_dict[row[1]] = int(row[2].replace(" ", ""))
    return lookup_dict


def _weight_frequencies(words: dict, lookup_dict: dict, min_freq: float,
                        boost: float, blow: float) -> dict:
    """Re-weight raw word counts against the wordlist.

    Words found in the wordlist are scaled by (max_freq / corpus_freq) / blow
    (common words are dampened); unknown words are multiplied by *boost*.
    Words below the minimum relative frequency are dropped.
    """
    words_total = sum(words.values())
    if words_total == 0:
        # Fix: no comments fetched previously raised ZeroDivisionError.
        return {}
    max_freq = max(words.values())
    out_dict = {}
    for word, freq in words.items():
        # NOTE(review): the -min help text says "in percent" but the comparison
        # uses a fraction (freq / words_total); kept as-is to preserve behavior.
        if freq / words_total >= min_freq:
            out_freq = freq
            corpus_freq = lookup_dict.get(word, 0)
            if corpus_freq:
                # Fix: a zero corpus frequency previously raised ZeroDivisionError;
                # such words are now treated as unlisted and boosted instead.
                out_freq *= (max_freq / corpus_freq) / blow
            else:
                out_freq *= boost
            out_dict[word] = out_freq
    return out_dict


def main() -> None:
    """Entry point: fetch comments, weight words, render the cloud."""
    args = build_parser().parse_args()

    with open("config.json") as file:
        settings = json.load(file)
    reddit = praw.Reddit(client_id=settings["client_id"],
                         client_secret=settings["secret"],
                         user_agent="Windows10:RWC:1.0")

    comments = _gather_comments(reddit, args)

    stopwords = set(STOPWORDS)
    if args.l is not None:
        for language in args.l:
            stopwords.update(safe_get_stop_words(language.lower()))

    mask = None
    colors = None
    if args.m is not None:
        print("Creating mask...", end=" ", flush=True)
        mask = np.array(Image.open(args.m).convert("RGB"))
        colors = ImageColorGenerator(mask)
        print("Done!")

    cw = 0
    cc = None
    if args.c is not None:
        cw = int(args.c[0])
        cc = args.c[1]

    wc_obj = WordCloud(font_path="ARIALUNI.TTF",
                       max_words=args.N,
                       collocations=False,
                       scale=args.s,
                       stopwords=stopwords,
                       mask=mask,
                       background_color=args.b,
                       mode="RGB",
                       contour_width=cw,
                       contour_color=cc)

    print("Processing words...", end=" ", flush=True)
    words = wc_obj.process_text(' '.join(comments))
    lookup_dict = _load_wordlist(args.w)
    out_dict = _weight_frequencies(words, lookup_dict, args.min, args.boost, args.blow)
    print("Done!")

    print("Create cloud...", end=" ", flush=True)
    wordcloud = wc_obj.generate_from_frequencies(out_dict)
    print("Done!")

    # Fix: --color without -m previously passed color_func=None, recoloring
    # the cloud randomly; now the flag only applies when a mask was loaded.
    if args.color and colors is not None:
        wordcloud.recolor(color_func=colors)
    if args.o is not None:
        wordcloud.to_file(args.o)
    if args.verbose:
        with open("comments.log", "w+", encoding="utf-8") as log_file:
            log_file.writelines("%s\n" % comment for comment in comments)

    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()


if __name__ == "__main__":
    main()