From 7aabde7516d3dbb9b8e98677dc6f315ae20d336d Mon Sep 17 00:00:00 2001 From: Robert Date: Mon, 13 Jul 2020 20:45:25 +0200 Subject: [PATCH] Fixed https filtering --- .gitignore | 3 ++- cloud.py | 12 +++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 511ae39..4630531 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ venv *.jpg *.png -*.json \ No newline at end of file +*.json +*.log \ No newline at end of file diff --git a/cloud.py b/cloud.py index e9e8f70..52d24b2 100644 --- a/cloud.py +++ b/cloud.py @@ -33,6 +33,8 @@ parser.add_argument("--color", action="store_true", help="Use mask as color mask") parser.add_argument("-N", metavar="max_words", type=int, help="Maximum number of words in WordCloud (Default 200)") +parser.add_argument("-v", "--verbose", action="store_true", + help="Dump comments to comments.log") args = parser.parse_args() @@ -50,11 +52,11 @@ if args.N is None: def fetch_comments(comment) -> list: + comment_body = re.sub(r'[\[\(]?https?:\/\/[0-9A-Za-z\/\?#\[\]\)@\.!$\&%\-+,;=]+', '', comment.body) if len(comment.replies) == 0: - c = re.sub(r'(\[text\]\()?https?://[0-9A-Za-z/\?#\[\]@\.!$\&%\-+,;=]+\)?', '', comment.body) - return [re.sub(r'https?://[0-9A-Za-z/\?#\[\]@\.!$\&%\-+,;=]+', '', c)] + return [comment_body] - raw_comments = [comment.body] + raw_comments = [comment_body] for comm in comment.replies: raw_comments.extend(fetch_comments(comm)) @@ -122,6 +124,10 @@ if args.color is True: if args.o is not None: wordcloud.to_file(args.o) +if args.verbose is True: + with open("comments.log", "w+", encoding="utf-8") as file: + file.writelines("%s\n" % comment for comment in comments) + plt.imshow(wordcloud, interpolation="bilinear") plt.axis("off")