Added better filtering

2020-07-13 22:15:28 +02:00 · 2020-07-13 22:15:28 +02:00 · 9c6f9fcc47
commit 9c6f9fcc47
parent 7aabde7516
5 changed files with 556476 additions and 12 deletions
--- a/cloud.py
+++ b/cloud.py
@@ -2,6 +2,7 @@ import praw
 import json
 import argparse
 import re
+import csv
 from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
 from stop_words import safe_get_stop_words
 import matplotlib.pyplot as plt
@@ -35,6 +36,14 @@ parser.add_argument("-N", metavar="max_words", type=int,
                    help="Maximum number of words in WordCloud (Default 200)")
 parser.add_argument("-v", "--verbose", action="store_true",
                    help="Dump comments to comments.log")
+parser.add_argument("-w", metavar="wordlist", type=str,
+                    help="The wordlist to use for weighting (Default english)")
+parser.add_argument("-min", metavar="min_freq", type=float,
+                    help="The minimum frequency a word needs to have to be counted in %% (Default 0)")  # argparse %-formats help text, so a literal percent sign must be doubled
+parser.add_argument("-boost", metavar="freq_boost", type=float,
+                    help="The boost a word that isn't in the wordlist gets (Default 1)")
+parser.add_argument("-blow", metavar="freq_blow", type=float,
+                    help="The \"anti-boost\" a word that is in the wordlist gets (Default 1)")

 args = parser.parse_args()

@@ -50,6 +59,18 @@ if args.b is None:
 if args.N is None:
    args.N = 200

+# Options left off the command line come back from argparse as None;
+# swap each of those for its documented default.
+args.w = "english" if args.w is None else args.w
+
+# A minimum frequency of 0 keeps every word.
+args.min = 0 if args.min is None else args.min
+
+# boost is applied as a multiplier and blow as a divisor, so a value of 1
+# for either leaves a word's weight untouched.
+args.boost = 1 if args.boost is None else args.boost
+args.blow = 1 if args.blow is None else args.blow
+

 def fetch_comments(comment) -> list:
    comment_body = re.sub(r'[\[\(]?https?:\/\/[0-9A-Za-z\/\?#\[\]\)@\.!$\&%\-+,;=]+', '', comment.body)
@@ -106,17 +127,45 @@ if args.c is not None:
    cw = int(args.c[0])
    cc = args.c[1]

-wordcloud = WordCloud(font_path="ARIALUNI.TTF",
-                      max_words=args.N,
-                      collocations=False,
-                      scale=args.s,
-                      stopwords=stopwords,
-                      mask=mask,
-                      background_color=args.b,
-                      mode="RGB",
-                      contour_width=cw,
-                      contour_color=cc
-                      ).generate(' '.join(comments))
+wc_obj = WordCloud(font_path="ARIALUNI.TTF",
+                   max_words=args.N,
+                   collocations=False,
+                   scale=args.s,
+                   stopwords=stopwords,
+                   mask=mask,
+                   background_color=args.b,
+                   mode="RGB",
+                   contour_width=cw,
+                   contour_color=cc
+                   )
+
+# Raw per-word counts from the comments; they are re-weighted below before rendering.
+words = wc_obj.process_text(' '.join(comments))
+if args.blow == 0:
+    parser.error("-blow must not be 0 (it is used as a divisor)")
+
+# Wordlist lookup: the second CSV column is the word, the third its numeric weight.
+lookup_dict = {}
+with open(f"wordlists/{args.w}.csv", newline="") as wordlist_file:
+    for row in csv.reader(wordlist_file, delimiter=";"):
+        if len(row) < 3:
+            continue  # blank or truncated line: nothing to look up
+        ref_count = int(row[2].replace(" ", ""))
+        if ref_count > 0:  # a zero weight would divide by zero below; treat the word as unlisted
+            lookup_dict[row[1]] = ref_count
+
+words_total = sum(words.values())
+max_freq = max(words.values(), default=0)
+out_dict = {}
+for word, freq in words.items():
+    if 100 * freq / words_total < args.min:  # -min is documented as a percentage
+        continue
+    if word in lookup_dict:  # listed words shrink the larger their wordlist weight is
+        out_dict[word] = freq * ((max_freq / lookup_dict[word]) / args.blow)
+    else:
+        out_dict[word] = freq * args.boost
+
+wordcloud = wc_obj.generate_from_frequencies(out_dict)

 if args.color is True:
    wordcloud.recolor(color_func=colors)
@@ -131,4 +180,4 @@ if args.verbose is True:
 plt.imshow(wordcloud, interpolation="bilinear")
 plt.axis("off")

-plt.show()
+plt.show()