From 82ae100c9a30a82702223d641161b5bad54ecf93 Mon Sep 17 00:00:00 2001
From: Sam Ireland
Date: Sun, 18 Jun 2023 18:23:42 +0100
Subject: [PATCH] Add option to save to multiple smaller HTML files

---
 README.md         |  2 ++
 html/saved.html   |  8 ++++++
 html/style.css    |  7 +++++
 html/upvoted.html |  8 ++++++
 save.py           | 58 +++++++++++++++-------------------
 utilities.py      | 67 ++++++++++++++++++++++++++++++++++++++++++---
 6 files changed, 110 insertions(+), 40 deletions(-)

diff --git a/README.md b/README.md
index b3525fa..d133dc2 100644
--- a/README.md
+++ b/README.md
@@ -44,6 +44,8 @@ Each post will have its top-level comments saved, as well as each of their immed
 
 Linked media files (images, videos etc.) will be saved locally where possible, though imgur is currently not well supported in all cases.
 
+If you also want to break the resulting HTML into multiple smaller files (browsers can struggle to display enormous HTML documents), add the `--page-size 100` argument, replacing 100 with however many posts you want per page.
+
 ## Use with Docker
 
 Rather than installing dependencies locally, you can use docker to create a local image and use that instead. First build the image:
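For a sense of the layout this flag produces, here is a minimal sketch in plain Python with made-up numbers (the real logic is in the save.py changes further down): each full chunk of `page_size` posts becomes its own numbered page, a final partial page takes the remainder, and the single full file is still written alongside them.

```python
# Sketch: how --page-size 100 splits 250 posts across numbered pages.
posts = [f"post {n}" for n in range(250)]  # stand-ins for rendered post HTML
page_size = 100

page_count = (len(posts) // page_size) + 1  # mirrors the save.py loop below
for i in range(page_count):
    chunk = posts[i * page_size:(i + 1) * page_size]
    print(f"saved.{i}.html -> {len(chunk)} posts")
# saved.0.html -> 100 posts
# saved.1.html -> 100 posts
# saved.2.html -> 50 posts
# saved.html itself is still written with everything on one page.
```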
diff --git a/html/saved.html b/html/saved.html
index fea7a01..7a23074 100644
--- a/html/saved.html
+++ b/html/saved.html
@@ -6,6 +6,10 @@
     </head>
     <body>
         <main>
+            <div class="links">
+                <a href="saved.p.html">Previous</a>
+                <a href="saved.n.html">Next</a>
+            </div>
             <div class="posts">
                 <h1>Saved Posts</h1>
                 <!--posts-->
@@ -14,5 +18,9 @@
             </div>
             <div class="comments">
                 <h1>Saved Comments</h1>
                 <!--comments-->
             </div>
+            <div class="links">
+                <a href="saved.p.html">Previous</a>
+                <a href="saved.n.html">Next</a>
+            </div>
\ No newline at end of file
diff --git a/html/style.css b/html/style.css
index 54c8a3e..c485536 100644
--- a/html/style.css
+++ b/html/style.css
@@ -83,6 +83,13 @@ h1 {
     padding: 8px 16px;
 }
 
+.links {
+    padding: 12px 16px 0px;
+    font-size: 12px;
+    display: flex;
+    gap: 8px;
+}
+
 .post, .comment {
     border-top: 1px solid #f0f0f0;
     padding: 12px 16px;
diff --git a/html/upvoted.html b/html/upvoted.html
index 09c0dd2..3606696 100644
--- a/html/upvoted.html
+++ b/html/upvoted.html
@@ -6,7 +6,15 @@
     </head>
     <body>
         <main>
+            <div class="links">
+                <a href="upvoted.p.html">Previous</a>
+                <a href="upvoted.n.html">Next</a>
+            </div>
             <div class="posts">
                 <h1>Upvoted Posts</h1>
                 <!--posts-->
             </div>
+            <div class="links">
+                <a href="upvoted.p.html">Previous</a>
+                <a href="upvoted.n.html">Next</a>
+            </div>
\ No newline at end of file
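The `.p.html` and `.n.html` hrefs in these templates are placeholders: when a page is written, `save_html` (added to utilities.py below) rewrites them to point at the neighbouring page numbers, and blanks the "Previous"/"Next" text on the first and last pages. A minimal sketch of that substitution, assuming the link markup shown above and a zero-indexed page 2 of the saved archive:

```python
# Sketch of the placeholder rewriting save_html performs for an inner page.
template = '<a href="saved.p.html">Previous</a> <a href="saved.n.html">Next</a>'
page = 2  # this page will be written as saved.2.html

links = (template
         .replace(".p.html", f".{page - 1}.html")
         .replace(".n.html", f".{page + 1}.html"))
print(links)
# <a href="saved.1.html">Previous</a> <a href="saved.3.html">Next</a>
```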
diff --git a/save.py b/save.py
index 37a1c75..48da390 100644
--- a/save.py
+++ b/save.py
@@ -9,11 +9,13 @@ from utilities import *
 # Get arguments
 parser = argparse.ArgumentParser(description="Save reddit posts to file.")
 parser.add_argument("mode", type=str, nargs=1, choices=["saved", "upvoted"], help="The file to convert.")
-
 if os.getenv("DOCKER", "0") != "1":
     parser.add_argument("location", type=str, nargs=1, help="The path to save to.")
+# Optional page size argument
+parser.add_argument("--page-size", type=int, nargs=1, default=[0], help="The number of posts to save per page.")
 args = parser.parse_args()
 mode = args.mode[0]
+page_size = args.page_size[0]
 location = "./archive/" if os.getenv("DOCKER", "0") == "1" else args.location[0]
 
 # Is location specified a directory?
@@ -39,20 +41,15 @@ if not os.path.exists(os.path.join(location, "media")):
 if not os.path.exists(os.path.join(location, "posts")):
     os.mkdir(os.path.join(location, "posts"))
 
-# Are there any posts already?
-post_ids, existing_posts_html = [], []
-if os.path.exists(os.path.join(location, html_file)):
-    with open(os.path.join(location, html_file), encoding="utf-8") as f:
-        current_html = f.read()
-    post_ids = re.findall(r'id="(.+?)"', current_html)
-    existing_posts_html = re.findall(
-        r'(<div class="post"[\s\S]+?<\/div>)',
-        current_html
-    )
+# Get files to search through
+print("Getting previously saved posts and comments...")
+existing_ids, existing_posts_html, existing_comments_html = get_previous(location, html_file)
+print(len(existing_posts_html), "previous posts saved.")
+print(len(existing_comments_html), "previous comments saved.")
 
 # Get posts HTML
 posts_html = []
-posts = [p for p in get_posts(client) if p.id not in post_ids]
+posts = [p for p in get_posts(client) if p.id not in existing_ids]
 if not posts:
     print("No new saved posts")
 else:
@@ -67,20 +64,9 @@ else:
         f.write(page_html)
     posts_html += existing_posts_html
 
-# Are there any comments already?
-comment_ids, existing_comments_html = [], []
-if os.path.exists(os.path.join(location, html_file)):
-    with open(os.path.join(location, html_file), encoding="utf-8") as f:
-        current_html = f.read()
-    comment_ids = re.findall(r'id="(.+?)"', current_html)
-    existing_comments_html = re.findall(
-        r'(<div class="comment"[\s\S]+?<\/div>)',
-        current_html
-    )
-
 # Get comments HTML
 comments_html = []
-comments = [c for c in get_comments(client) if c.id not in comment_ids]
+comments = [c for c in get_comments(client) if c.id not in existing_ids]
 if not comments:
     print("No new saved comments")
 else:
@@ -90,16 +76,14 @@ else:
         comments_html.append(comment_html)
     comments_html += existing_comments_html
 
-# Save HTML
-with open(os.path.join("html", html_file), encoding="utf-8") as f:
-    html = f.read()
-with open(os.path.join("html", "style.css"), encoding="utf-8") as f:
-    html = html.replace("<style></style>", f"<style>{f.read()}</style>")
-with open(os.path.join("html", "main.js"), encoding="utf-8") as f:
-    html = html.replace("<script></script>", f"<script>{f.read()}</script>")
-html = html.replace("<!--posts-->", "\n".join(posts_html))
-html = html.replace("<!--comments-->", "\n".join(comments_html))
-with open(os.path.join(location, html_file), "w", encoding="utf-8") as f:
-    f.write(html)
-
-
+# Save overall HTML
+print("Saving HTML...")
+if page_size:
+    length = max(len(posts_html), len(comments_html))
+    page_count = (length // page_size) + 1
+    for i in range(page_count):
+        posts_on_page = posts_html[i*page_size:(i+1)*page_size]
+        comments_on_page = comments_html[i*page_size:(i+1)*page_size]
+        has_next = i < page_count - 1
+        save_html(posts_on_page, comments_on_page, location, html_file, i, has_next)
+save_html(posts_html, comments_html, location, html_file, None, False)
\ No newline at end of file
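Posts and comments are paged in lockstep here: page `i` takes slice `i` of both lists, and the page count follows the longer list, so later pages may hold posts but no comments (or the reverse). A toy illustration of the slicing, with made-up lists:

```python
# Toy illustration of the lockstep slicing in the paging loop above.
posts = ["p1", "p2", "p3", "p4", "p5"]
comments = ["c1", "c2"]
page_size = 2

length = max(len(posts), len(comments))
page_count = (length // page_size) + 1
for i in range(page_count):
    print(i, posts[i*page_size:(i+1)*page_size], comments[i*page_size:(i+1)*page_size])
# 0 ['p1', 'p2'] ['c1', 'c2']
# 1 ['p3', 'p4'] []
# 2 ['p5'] []
```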
diff --git a/utilities.py b/utilities.py
index b6fbfc7..cbfe849 100644
--- a/utilities.py
+++ b/utilities.py
@@ -32,6 +32,39 @@ def make_client():
     )
 
 
+def get_previous(location, html_file):
+    html_files = [f for f in os.listdir(location) if f.endswith(".html")]
+    pattern = html_file.replace(".html", r"\.(\d+)?\.html")
+    matches = [re.match(pattern, f) for f in html_files]
+    matches = [m[0] for m in matches if m]
+    matches.sort(key=lambda x: int(x.split(".")[1]))
+    existing_ids = []
+    existing_posts_html = []
+    existing_comments_html = []
+    if html_file in html_files: matches.append(html_file)
+    for match in matches:
+        with open(os.path.join(location, match), encoding="utf-8") as f:
+            current_html = f.read()
+        for id in re.findall(r'id="(.+?)"', current_html):
+            if id not in existing_ids:
+                existing_ids.append(id)
+        posts = re.findall(
+            r'(<div class="post"[\s\S]+?<\/div>)',
+            current_html
+        )
+        comments = re.findall(
+            r'(<div class="comment"[\s\S]+?<\/div>)',
+            current_html
+        )
+        for post in posts:
+            if post not in existing_posts_html:
+                existing_posts_html.append(post)
+        for comment in comments:
+            if comment not in existing_comments_html:
+                existing_comments_html.append(comment)
+    return existing_ids, existing_posts_html, existing_comments_html
+
+
 def get_saved_posts(client):
 
     """Gets a list of posts that the user has saved."""
@@ -100,7 +133,10 @@ def save_media(post, location):
     # Can the media be obtained directly?
     if extension in IMAGE_EXTENSIONS + VIDEO_EXTENSIONS:
         filename = f"{readable_name}_{post.id}.{extension}"
-        response = requests.get(post.url)
+        try:
+            response = requests.get(post.url)
+        except:
+            return
         media_type = response.headers.get("Content-Type", "")
         if media_type.startswith("image") or media_type.startswith("video"):
             with open(os.path.join(location, "media", filename), "wb") as f:
@@ -138,7 +174,9 @@ def save_media(post, location):
             direct_url = f'https://i.{url[url.find("//") + 2:]}.{extension}'
             direct_url = direct_url.replace("i.imgur.com", "imgur.com")
             direct_url = direct_url.replace("m.imgur.com", "imgur.com")
-            response = requests.get(direct_url)
+            try:
+                response = requests.get(direct_url)
+            except: continue
             if response.status_code == 200:
                 filename = f"{readable_name}_{post.id}.{extension}"
                 with open(os.path.join(location, "media", filename), "wb") as f:
@@ -158,7 +196,8 @@ def save_media(post, location):
         try:
             ydl.download([url])
         except:
-            pass
+            os.chdir(current)
+            return
         for f in os.listdir(os.path.join(location, "media")):
             if f.startswith(f"{readable_name}_{post.id}"):
                 return f
@@ -237,3 +276,25 @@ def get_comment_html(comment, children=True, op=None):
             children_html.append(get_comment_html(child, children=False, op=op))
         html = html.replace("<!--children-->", "\n".join(children_html))
     return html
+
+
+def save_html(posts, comments, location, html_file, page, has_next):
+    with open(os.path.join("html", html_file), encoding="utf-8") as f:
+        html = f.read()
+    with open(os.path.join("html", "style.css"), encoding="utf-8") as f:
+        html = html.replace("<style></style>", f"<style>{f.read()}</style>")
+    with open(os.path.join("html", "main.js"), encoding="utf-8") as f:
+        html = html.replace("<script></script>", f"<script>{f.read()}</script>")
+    if page == 0 or page is None:
+        html = html.replace("Previous", "")
+    else:
+        html = html.replace(".p.html", f".{page-1}.html")
+    if not has_next or page is None:
+        html = html.replace("Next", "")
+    else:
+        html = html.replace(".n.html", f".{page+1}.html")
+    html = html.replace("<!--posts-->", "\n".join(posts))
+    html = html.replace("<!--comments-->", "\n".join(comments))
+    file_name = html_file if page is None else html_file.replace(".html", f".{page}.html")
+    with open(os.path.join(location, file_name), "w", encoding="utf-8") as f:
+        f.write(html)
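On a subsequent run, `get_previous` rediscovers earlier pages by turning the master file name into a regex. A quick check of that derived pattern against some hypothetical archive contents:

```python
import re

# The pattern get_previous derives for html_file = "saved.html".
pattern = "saved.html".replace(".html", r"\.(\d+)?\.html")  # -> saved\.(\d+)?\.html

files = ["saved.html", "saved.0.html", "saved.1.html", "upvoted.html", "style.css"]
pages = [f for f in files if re.match(pattern, f)]
pages.sort(key=lambda f: int(f.split(".")[1]))  # numeric page order, as in get_previous
print(pages)  # ['saved.0.html', 'saved.1.html']
# saved.html itself is appended after the pages, so the master file is read last.
```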