From 82ae100c9a30a82702223d641161b5bad54ecf93 Mon Sep 17 00:00:00 2001
From: Sam Ireland
Date: Sun, 18 Jun 2023 18:23:42 +0100
Subject: [PATCH] Add option to save to multiple smaller HTML files

---
 README.md         |  2 ++
 html/saved.html   |  8 ++++++
 html/style.css    |  7 +++++
 html/upvoted.html |  8 ++++++
 save.py           | 58 +++++++++++++++-------------------
 utilities.py      | 67 ++++++++++++++++++++++++++++++++++++++++++---
 6 files changed, 110 insertions(+), 40 deletions(-)

diff --git a/README.md b/README.md
index b3525fa..d133dc2 100644
--- a/README.md
+++ b/README.md
@@ -44,6 +44,8 @@ Each post will have its top-level comments saved, as well as each of their immed
 
 Linked media files (images, videos etc.) will be saved locally where possible, though imgur is currently not well supported in all cases.
 
+If you also want to break the resulting HTML into multiple smaller files (browsers can struggle to display enormous HTML documents), add the `--page-size 100` argument, replacing 100 with however many posts you want per page.
+
 ## Use with Docker
 
 Rather than installing dependencies locally, you can use docker to create a local image and use that instead. First build the image:
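For a sense of the layout this flag produces, here is a minimal sketch in plain Python with made-up numbers (the real logic is in the save.py changes further down): each full chunk of `page_size` posts becomes its own numbered page, a final partial page takes the remainder, and the single full file is still written alongside them.

```python
# Sketch: how --page-size 100 splits 250 posts across numbered pages.
posts = [f"post {n}" for n in range(250)]  # stand-ins for rendered post HTML
page_size = 100

page_count = (len(posts) // page_size) + 1  # mirrors the save.py loop below
for i in range(page_count):
    chunk = posts[i * page_size:(i + 1) * page_size]
    print(f"saved.{i}.html -> {len(chunk)} posts")
# saved.0.html -> 100 posts
# saved.1.html -> 100 posts
# saved.2.html -> 50 posts
# saved.html itself is still written with everything on one page.
```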
diff --git a/html/saved.html b/html/saved.html
index fea7a01..7a23074 100644
--- a/html/saved.html
+++ b/html/saved.html
@@ -6,6 +6,10 @@
     </head>
     <body>
         <main>
+            <div class="links">
+                <a href="saved.p.html">Previous</a>
+                <a href="saved.n.html">Next</a>
+            </div>
             <div class="posts">
                 <h1>Saved Posts</h1>
                 <!--posts-->
@@ -14,5 +18,9 @@
             </div>
             <div class="comments">
                 <h1>Saved Comments</h1>
                 <!--comments-->
             </div>
+            <div class="links">
+                <a href="saved.p.html">Previous</a>
+                <a href="saved.n.html">Next</a>
+            </div>
\ No newline at end of file
diff --git a/html/style.css b/html/style.css
index 54c8a3e..c485536 100644
--- a/html/style.css
+++ b/html/style.css
@@ -83,6 +83,13 @@ h1 {
     padding: 8px 16px;
 }
 
+.links {
+    padding: 12px 16px 0px;
+    font-size: 12px;
+    display: flex;
+    gap: 8px;
+}
+
 .post, .comment {
     border-top: 1px solid #f0f0f0;
     padding: 12px 16px;
diff --git a/html/upvoted.html b/html/upvoted.html
index 09c0dd2..3606696 100644
--- a/html/upvoted.html
+++ b/html/upvoted.html
@@ -6,7 +6,15 @@
     </head>
     <body>
         <main>
+            <div class="links">
+                <a href="upvoted.p.html">Previous</a>
+                <a href="upvoted.n.html">Next</a>
+            </div>
             <div class="posts">
                 <h1>Upvoted Posts</h1>
                 <!--posts-->
             </div>
+            <div class="links">
+                <a href="upvoted.p.html">Previous</a>
+                <a href="upvoted.n.html">Next</a>
+            </div>
\ No newline at end of file
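The `.p.html` and `.n.html` hrefs in these templates are placeholders: when a page is written, `save_html` (added to utilities.py below) rewrites them to point at the neighbouring page numbers, and blanks the "Previous"/"Next" text on the first and last pages. A minimal sketch of that substitution, assuming the link markup shown above and a zero-indexed page 2 of the saved archive:

```python
# Sketch of the placeholder rewriting save_html performs for an inner page.
template = '<a href="saved.p.html">Previous</a> <a href="saved.n.html">Next</a>'
page = 2  # this page will be written as saved.2.html

links = (template
         .replace(".p.html", f".{page - 1}.html")
         .replace(".n.html", f".{page + 1}.html"))
print(links)
# <a href="saved.1.html">Previous</a> <a href="saved.3.html">Next</a>
```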
diff --git a/save.py b/save.py
index 37a1c75..48da390 100644
--- a/save.py
+++ b/save.py
@@ -9,11 +9,13 @@ from utilities import *
 # Get arguments
 parser = argparse.ArgumentParser(description="Save reddit posts to file.")
 parser.add_argument("mode", type=str, nargs=1, choices=["saved", "upvoted"], help="The file to convert.")
-
 if os.getenv("DOCKER", "0") != "1":
     parser.add_argument("location", type=str, nargs=1, help="The path to save to.")
+# Optional page size argument
+parser.add_argument("--page-size", type=int, nargs=1, default=[0], help="The number of posts to save per page.")
 args = parser.parse_args()
 mode = args.mode[0]
+page_size = args.page_size[0]
 location = "./archive/" if os.getenv("DOCKER", "0") == "1" else args.location[0]
 
 # Is location specified a directory?
@@ -39,20 +41,15 @@ if not os.path.exists(os.path.join(location, "media")):
 if not os.path.exists(os.path.join(location, "posts")):
     os.mkdir(os.path.join(location, "posts"))
 
-# Are there any posts already?
-post_ids, existing_posts_html = [], []
-if os.path.exists(os.path.join(location, html_file)):
-    with open(os.path.join(location, html_file), encoding="utf-8") as f:
-        current_html = f.read()
-    post_ids = re.findall(r'id="(.+?)"', current_html)
-    existing_posts_html = re.findall(
-        r'(<div class="post"[\s\S]+?<\/div>)',
-        current_html
-    )
+# Get files to search through
+print("Getting previously saved posts and comments...")
+existing_ids, existing_posts_html, existing_comments_html = get_previous(location, html_file)
+print(len(existing_posts_html), "previous posts saved.")
+print(len(existing_comments_html), "previous comments saved.")
 
 # Get posts HTML
 posts_html = []
-posts = [p for p in get_posts(client) if p.id not in post_ids]
+posts = [p for p in get_posts(client) if p.id not in existing_ids]
 if not posts:
     print("No new saved posts")
 else:
@@ -67,20 +64,9 @@ else:
         f.write(page_html)
     posts_html += existing_posts_html
 
-# Are there any comments already?
-comment_ids, existing_comments_html = [], []
-if os.path.exists(os.path.join(location, html_file)):
-    with open(os.path.join(location, html_file), encoding="utf-8") as f:
-        current_html = f.read()
-    comment_ids = re.findall(r'id="(.+?)"', current_html)
-    existing_comments_html = re.findall(
-        r'(<div class="comment"[\s\S]+?<\/div>)',
-        current_html
-    )
-
 # Get comments HTML
 comments_html = []
-comments = [c for c in get_comments(client) if c.id not in comment_ids]
+comments = [c for c in get_comments(client) if c.id not in existing_ids]
 if not comments:
     print("No new saved comments")
 else:
@@ -90,16 +76,14 @@ else:
         comments_html.append(comment_html)
     comments_html += existing_comments_html
 
-# Save HTML
-with open(os.path.join("html", html_file), encoding="utf-8") as f:
-    html = f.read()
-with open(os.path.join("html", "style.css"), encoding="utf-8") as f:
-    html = html.replace("<style></style>", f"<style>{f.read()}</style>")
-with open(os.path.join("html", "main.js"), encoding="utf-8") as f:
-    html = html.replace("<script></script>", f"<script>{f.read()}</script>")
-html = html.replace("<!--posts-->", "\n".join(posts_html))
-html = html.replace("<!--comments-->", "\n".join(comments_html))
-with open(os.path.join(location, html_file), "w", encoding="utf-8") as f:
-    f.write(html)
-
-
+# Save overall HTML
+print("Saving HTML...")
+if page_size:
+    length = max(len(posts_html), len(comments_html))
+    page_count = (length // page_size) + 1
+    for i in range(page_count):
+        posts_on_page = posts_html[i*page_size:(i+1)*page_size]
+        comments_on_page = comments_html[i*page_size:(i+1)*page_size]
+        has_next = i < page_count - 1
+        save_html(posts_on_page, comments_on_page, location, html_file, i, has_next)
+save_html(posts_html, comments_html, location, html_file, None, False)
\ No newline at end of file
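Posts and comments are paged in lockstep here: page `i` takes slice `i` of both lists, and the page count follows the longer list, so later pages may hold posts but no comments (or the reverse). A toy illustration of the slicing, with made-up lists:

```python
# Toy illustration of the lockstep slicing in the paging loop above.
posts = ["p1", "p2", "p3", "p4", "p5"]
comments = ["c1", "c2"]
page_size = 2

length = max(len(posts), len(comments))
page_count = (length // page_size) + 1
for i in range(page_count):
    print(i, posts[i*page_size:(i+1)*page_size], comments[i*page_size:(i+1)*page_size])
# 0 ['p1', 'p2'] ['c1', 'c2']
# 1 ['p3', 'p4'] []
# 2 ['p5'] []
```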
diff --git a/utilities.py b/utilities.py
index b6fbfc7..cbfe849 100644
--- a/utilities.py
+++ b/utilities.py
@@ -32,6 +32,39 @@ def make_client():
     )
 
 
+def get_previous(location, html_file):
+    html_files = [f for f in os.listdir(location) if f.endswith(".html")]
+    pattern = html_file.replace(".html", r"\.(\d+)?\.html")
+    matches = [re.match(pattern, f) for f in html_files]
+    matches = [m[0] for m in matches if m]
+    matches.sort(key=lambda x: int(x.split(".")[1]))
+    existing_ids = []
+    existing_posts_html = []
+    existing_comments_html = []
+    if html_file in html_files: matches.append(html_file)
+    for match in matches:
+        with open(os.path.join(location, match), encoding="utf-8") as f:
+            current_html = f.read()
+        for id in re.findall(r'id="(.+?)"', current_html):
+            if id not in existing_ids:
+                existing_ids.append(id)
+        posts = re.findall(
+            r'(<div class="post"[\s\S]+?<\/div>)',
+            current_html
+        )
+        comments = re.findall(
+            r'(<div class="comment"[\s\S]+?<\/div>)',
+            current_html
+        )
+        for post in posts:
+            if post not in existing_posts_html:
+                existing_posts_html.append(post)
+        for comment in comments:
+            if comment not in existing_comments_html:
+                existing_comments_html.append(comment)
+    return existing_ids, existing_posts_html, existing_comments_html
+
+
 def get_saved_posts(client):
 
     """Gets a list of posts that the user has saved."""
@@ -100,7 +133,10 @@ def save_media(post, location):
     # Can the media be obtained directly?
     if extension in IMAGE_EXTENSIONS + VIDEO_EXTENSIONS:
         filename = f"{readable_name}_{post.id}.{extension}"
-        response = requests.get(post.url)
+        try:
+            response = requests.get(post.url)
+        except:
+            return
         media_type = response.headers.get("Content-Type", "")
         if media_type.startswith("image") or media_type.startswith("video"):
             with open(os.path.join(location, "media", filename), "wb") as f:
@@ -138,7 +174,9 @@ def save_media(post, location):
             direct_url = f'https://i.{url[url.find("//") + 2:]}.{extension}'
             direct_url = direct_url.replace("i.imgur.com", "imgur.com")
             direct_url = direct_url.replace("m.imgur.com", "imgur.com")
-            response = requests.get(direct_url)
+            try:
+                response = requests.get(direct_url)
+            except: continue
             if response.status_code == 200:
                 filename = f"{readable_name}_{post.id}.{extension}"
                 with open(os.path.join(location, "media", filename), "wb") as f:
@@ -158,7 +196,8 @@ def save_media(post, location):
         try:
             ydl.download([url])
         except:
-            pass
+            os.chdir(current)
+            return
         for f in os.listdir(os.path.join(location, "media")):
             if f.startswith(f"{readable_name}_{post.id}"):
                 return f
@@ -237,3 +276,25 @@ def get_comment_html(comment, children=True, op=None):
             children_html.append(get_comment_html(child, children=False, op=op))
         html = html.replace("<!--children-->", "\n".join(children_html))
     return html
+
+
+def save_html(posts, comments, location, html_file, page, has_next):
+    with open(os.path.join("html", html_file), encoding="utf-8") as f:
+        html = f.read()
+    with open(os.path.join("html", "style.css"), encoding="utf-8") as f:
+        html = html.replace("<style></style>", f"<style>{f.read()}</style>")
+    with open(os.path.join("html", "main.js"), encoding="utf-8") as f:
+        html = html.replace("<script></script>", f"<script>{f.read()}</script>")
+    if page == 0 or page is None:
+        html = html.replace("Previous", "")
+    else:
+        html = html.replace(".p.html", f".{page-1}.html")
+    if not has_next or page is None:
+        html = html.replace("Next", "")
+    else:
+        html = html.replace(".n.html", f".{page+1}.html")
+    html = html.replace("<!--posts-->", "\n".join(posts))
+    html = html.replace("<!--comments-->", "\n".join(comments))
+    file_name = html_file if page is None else html_file.replace(".html", f".{page}.html")
+    with open(os.path.join(location, file_name), "w", encoding="utf-8") as f:
+        f.write(html)
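On a subsequent run, `get_previous` rediscovers earlier pages by turning the master file name into a regex. A quick check of that derived pattern against some hypothetical archive contents:

```python
import re

# The pattern get_previous derives for html_file = "saved.html".
pattern = "saved.html".replace(".html", r"\.(\d+)?\.html")  # -> saved\.(\d+)?\.html

files = ["saved.html", "saved.0.html", "saved.1.html", "upvoted.html", "style.css"]
pages = [f for f in files if re.match(pattern, f)]
pages.sort(key=lambda f: int(f.split(".")[1]))  # numeric page order, as in get_previous
print(pages)  # ['saved.0.html', 'saved.1.html']
# saved.html itself is appended after the pages, so the master file is read last.
```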