Added better filtering
parent 7aabde7516
commit 9c6f9fcc47
5 changed files with 556476 additions and 12 deletions
cloud.py (73 changed lines)
@@ -2,6 +2,7 @@ import praw
 import json
 import argparse
 import re
+import csv
 from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
 from stop_words import safe_get_stop_words
 import matplotlib.pyplot as plt
@@ -35,6 +36,14 @@ parser.add_argument("-N", metavar="max_words", type=int,
                     help="Maximum number of words in WordCloud (Default 200)")
 parser.add_argument("-v", "--verbose", action="store_true",
                     help="Dump comments to comments.log")
+parser.add_argument("-w", metavar="wordlist", type=str,
+                    help="The wordlist to use for weighting (Default english)")
+parser.add_argument("-min", metavar="min_freq", type=float,
+                    help="The minimum frequency a word needs to have to be counted in % (Default 0)")
+parser.add_argument("-boost", metavar="freq_boost", type=float,
+                    help="The boost a word that isn't in the wordlist gets (Default 1)")
+parser.add_argument("-blow", metavar="freq_blow", type=float,
+                    help="The \"anti-boost\" a word that is in the wordlist gets (Default 1)")
 
 args = parser.parse_args()
 
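The four new flags drive the wordlist-based weighting added further down in this commit. As a quick sketch that is not part of the commit itself: the option names match the diff, the inline defaults stand in for the None-checks cloud.py performs after parsing, and the sample argument list is made up.

    import argparse

    # Stand-alone sketch of the new options; attribute names follow from the
    # option strings ("-min" becomes args.min, and so on).
    parser = argparse.ArgumentParser()
    parser.add_argument("-w", metavar="wordlist", type=str, default="english",
                        help="wordlist used for weighting")
    parser.add_argument("-min", metavar="min_freq", type=float, default=0,
                        help="minimum relative frequency a word needs to be kept")
    parser.add_argument("-boost", metavar="freq_boost", type=float, default=1,
                        help="multiplier for words that are not in the wordlist")
    parser.add_argument("-blow", metavar="freq_blow", type=float, default=1,
                        help="dampening divisor for words that are in the wordlist")

    args = parser.parse_args(["-w", "german", "-min", "0.001", "-boost", "2"])
    print(args.w, args.min, args.boost, args.blow)  # -> german 0.001 2.0 1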
@@ -50,6 +59,18 @@ if args.b is None:
 if args.N is None:
     args.N = 200
 
+if args.w is None:
+    args.w = "english"
+
+if args.min is None:
+    args.min = 0
+
+if args.boost is None:
+    args.boost = 1
+
+if args.blow is None:
+    args.blow = 1
+
 
 def fetch_comments(comment) -> list:
     comment_body = re.sub(r'[\[\(]?https?:\/\/[0-9A-Za-z\/\?#\[\]\)@\.!$\&%\-+,;=]+', '', comment.body)
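For context, fetch_comments() strips URLs from each comment body before anything is counted. A minimal illustration of that substitution, using a made-up sample string:

    import re

    # Same pattern fetch_comments() applies to comment.body; it removes plain
    # links as well as links wrapped in parentheses or brackets.
    url_pattern = r'[\[\(]?https?:\/\/[0-9A-Za-z\/\?#\[\]\)@\.!$\&%\-+,;=]+'
    sample = "see this (https://example.com/page?x=1) for details"
    print(re.sub(url_pattern, '', sample))  # the URL and its parentheses are gone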
@@ -106,17 +127,45 @@ if args.c is not None:
     cw = int(args.c[0])
     cc = args.c[1]
 
-wordcloud = WordCloud(font_path="ARIALUNI.TTF",
-                      max_words=args.N,
-                      collocations=False,
-                      scale=args.s,
-                      stopwords=stopwords,
-                      mask=mask,
-                      background_color=args.b,
-                      mode="RGB",
-                      contour_width=cw,
-                      contour_color=cc
-                      ).generate(' '.join(comments))
+wc_obj = WordCloud(font_path="ARIALUNI.TTF",
+                   max_words=args.N,
+                   collocations=False,
+                   scale=args.s,
+                   stopwords=stopwords,
+                   mask=mask,
+                   background_color=args.b,
+                   mode="RGB",
+                   contour_width=cw,
+                   contour_color=cc
+                   )
+
+words = wc_obj.process_text(' '.join(comments))
+
+with open(f"wordlists/{args.w}.csv") as list:
+    lookup = csv.reader(list, delimiter=";")
+    lookup_dict = {}
+    for row in lookup:
+        lookup_dict[row[1]] = int(row[2].replace(" ", ""))
+
+out_dict = {}
+words_total = 0
+max_freq = 0
+for (word, freq) in words.items():
+    words_total += freq
+    if freq > max_freq:
+        max_freq = freq
+
+for (word, freq) in words.items():
+    if freq / words_total >= args.min:
+        out_freq = freq
+        if word in lookup_dict:
+            out_freq *= (max_freq / lookup_dict[word]) / args.blow
+        else:
+            out_freq *= args.boost
+
+        out_dict[word] = out_freq
+
+wordcloud = wc_obj.generate_from_frequencies(out_dict)
 
 if args.color is True:
     wordcloud.recolor(color_func=colors)
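The core of the new filtering is this re-weighting loop: WordCloud.process_text() gives raw per-word counts, a reference wordlist is loaded from wordlists/<wordlist>.csv (read with ";" as delimiter; column 1 is taken as the word and column 2 as a count with spaces stripped, so a rank;word;count layout appears to be assumed), and every word that clears the -min threshold is then scaled. Words found in the wordlist are divided down in proportion to how common they are in the reference list, and further by -blow; words missing from it, typically subreddit-specific terms, are multiplied by -boost. A self-contained sketch of that step with toy numbers (the counts and wordlist entries are made up, only the formula matches the diff):

    # Toy re-implementation of the weighting step above.
    words = {"the": 120, "reddit": 40, "cake": 15}        # as if from WordCloud.process_text()
    lookup_dict = {"the": 23135851162, "cake": 9049397}   # as if read from wordlists/english.csv
    min_freq, boost, blow = 0.0, 1.0, 1.0                 # -min, -boost, -blow defaults

    words_total = sum(words.values())
    max_freq = max(words.values())

    out_dict = {}
    for word, freq in words.items():
        if freq / words_total >= min_freq:
            out_freq = freq
            if word in lookup_dict:
                # common reference words are pushed down, hardest for the most common ones
                out_freq *= (max_freq / lookup_dict[word]) / blow
            else:
                # words the reference list does not know are boosted instead
                out_freq *= boost
            out_dict[word] = out_freq

    print(out_dict)  # "the" and "cake" shrink drastically, "reddit" keeps its raw count

Only the ratios between these weights matter for the final image, since generate_from_frequencies() scales everything relative to the largest weight.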
@@ -131,4 +180,4 @@ if args.verbose is True:
 plt.imshow(wordcloud, interpolation="bilinear")
 plt.axis("off")
 
-plt.show()
+plt.show()