#!/usr/bin/env python
import argparse
import os
import re
from tqdm import tqdm
from utilities import *
# Get arguments.
# NOTE: each positional uses nargs=1, so every value arrives as a 1-element
# list and is unwrapped with [0] below — kept as-is for consistency.
parser = argparse.ArgumentParser(description="Save reddit posts to file.")
parser.add_argument(
    "mode", type=str, nargs=1, choices=["saved", "upvoted"],
    # Fixed help text: the original said "The file to convert.", a
    # copy-paste error — this flag selects which listing to archive.
    help="Which listing to archive: saved or upvoted items.",
)
# Inside Docker the archive location is fixed to ./archive/, so the
# positional is only registered when running outside a container.
if os.getenv("DOCKER", "0") != "1":
    parser.add_argument("location", type=str, nargs=1, help="The path to save to.")

# Optional page size argument (0, the default, means one single page).
parser.add_argument(
    "--page-size", type=int, nargs=1, default=[0],
    help="The number of posts to save per page.",
)

args = parser.parse_args()
mode = args.mode[0]
page_size = args.page_size[0]
location = "./archive/" if os.getenv("DOCKER", "0") == "1" else args.location[0]
# Is the location specified a directory?
if not os.path.isdir(location):
    print(location, "is not a directory")
    # Abort here: everything below writes into `location`, so falling
    # through (as the original did) just crashes later with a confusing
    # FileNotFoundError from os.mkdir.
    raise SystemExit(1)
# Make a client object.
# make_client() comes from the `utilities` star import; presumably it builds
# the authenticated reddit API client — credential handling is not visible
# here, so confirm against utilities.py.
client = make_client()
# Saved posts or upvoted posts? Pick the output page name and the two
# fetchers in one go; the upvoted mode archives no comments (its fetcher
# always yields an empty list).
if mode == "saved":
    html_file, get_posts, get_comments = (
        "saved.html", get_saved_posts, get_saved_comments,
    )
else:
    html_file, get_posts, get_comments = (
        "upvoted.html", get_upvoted_posts, lambda client: [],
    )
# Make directories for media and per-post pages. os.makedirs with
# exist_ok=True replaces the original's check-then-mkdir pairs: same
# result, but no TOCTOU race if the directory appears between the
# existence check and the mkdir call.
os.makedirs(os.path.join(location, "media"), exist_ok=True)
os.makedirs(os.path.join(location, "posts"), exist_ok=True)
# Get files to search through: load what earlier runs already archived so
# those items can be skipped and their HTML re-used.
print("Getting previously saved posts and comments...")
previous = get_previous(location, html_file)
existing_ids, existing_posts_html, existing_comments_html = previous
print(len(existing_posts_html), "previous posts.")
print(len(existing_comments_html), "previous comments.")
# Get posts HTML: render each newly fetched post, download its media, and
# write a standalone page for it under posts/.
posts_html = []
posts = [p for p in get_posts(client) if p.id not in existing_ids]
if posts:
    for post in tqdm(posts):
        html = get_post_html(post)
        saved = save_media(post, location)
        if saved:
            html = add_media_preview_to_html(html, saved)
        posts_html.append(html)
        page_path = os.path.join(location, "posts", f"{post.id}.html")
        with open(page_path, "w", encoding="utf-8") as f:
            f.write(create_post_page_html(post, html))
else:
    print("No new posts")
posts_html += existing_posts_html
# Get comments HTML for newly fetched comments.
comments_html = []
comments = [c for c in get_comments(client) if c.id not in existing_ids]
if not comments:
    print("No new comments")
else:
    for comment in tqdm(comments):
        comment_html = get_comment_html(comment)
        # Bug fix: the original called save_media(post, location) here,
        # reusing the stale `post` variable left over from the posts loop —
        # re-saving the last post's media once per comment, and raising
        # NameError when there were no new posts. The result was never
        # used; comments carry no media, so the call is simply dropped.
        comments_html.append(comment_html)
comments_html += existing_comments_html
# Save overall HTML. With a page size, split posts and comments into
# numbered pages; otherwise write everything as one unpaginated file.
print("Saving HTML...")
if page_size:
    length = max(len(posts_html), len(comments_html))
    # Ceiling division, clamped to at least one page. The original used
    # (length // page_size) + 1, which produced a spurious empty trailing
    # page (with a dangling "next" link from the last real page) whenever
    # length was an exact multiple of page_size.
    page_count = max(1, -(-length // page_size))
    for i in range(page_count):
        posts_on_page = posts_html[i * page_size:(i + 1) * page_size]
        comments_on_page = comments_html[i * page_size:(i + 1) * page_size]
        has_next = i < page_count - 1
        save_html(posts_on_page, comments_on_page, location, html_file, i, has_next)
else:
    save_html(posts_html, comments_html, location, html_file, None, False)