Add option to save to multiple smaller HTML files

2023-06-18 18:23:42 +01:00 · 2023-06-18 18:23:42 +01:00 · 82ae100c9a
commit 82ae100c9a
parent 085690feeb
6 changed files with 110 additions and 40 deletions
--- a/README.md
+++ b/README.md
@ -44,6 +44,8 @@ Each post will have its top-level comments saved, as well as each of their immed
 Linked media files (images, videos etc.) will be saved locally where possible, though imgur is currently not well supported in all cases.
 If you also want to split the resulting HTML file into multiple smaller files (browsers struggle to display enormous HTML files), add the `--page-size 100` argument, replacing 100 with the number of posts you want per page.
 ## Use with Docker
 Rather than installing dependencies locally, you can use docker to create a local image and use that instead. First build the image:
--- a/html/saved.html
+++ b/html/saved.html
@ -6,6 +6,10 @@
 <script></script>
 </head>
 <body>
 <div class="links">
 <a href="saved.p.html">Previous</a>
 <a href="saved.n.html">Next</a>
 </div>
 <section class="posts-section">
 <h1>Saved Posts</h1>
 <!--posts-->
@ -14,5 +18,9 @@
 <h1>Saved Comments</h1>
 <!--comments-->
 </section>
 <div class="links">
 <a href="saved.p.html">Previous</a>
 <a href="saved.n.html">Next</a>
 </div>
 </body>
 </html>
--- a/html/style.css
+++ b/html/style.css
@ -83,6 +83,13 @@ h1 {
    padding: 8px 16px;
 }
 .links {
    padding: 12px 16px 0px;
    font-size: 12px;
    display: flex;
    gap: 8px;
 }
 .post, .comment {
    border-top: 1px solid #f0f0f0;
    padding: 12px 16px;
--- a/html/upvoted.html
+++ b/html/upvoted.html
@ -6,7 +6,15 @@
 <script></script>
 </head>
 <body>
 <div class="links">
 <a href="upvoted.p.html">Previous</a>
 <a href="upvoted.n.html">Next</a>
 </div>
 <h1>Upvoted Posts</h1>
 <!--posts-->
 <div class="links">
 <a href="upvoted.p.html">Previous</a>
 <a href="upvoted.n.html">Next</a>
 </div>
 </body>
 </html>
--- a/save.py
+++ b/save.py
@ -9,11 +9,13 @@ from utilities import *
 # Get arguments
 parser = argparse.ArgumentParser(description="Save reddit posts to file.")
 parser.add_argument("mode", type=str, nargs=1, choices=["saved", "upvoted"], help="The file to convert.")
 if os.getenv("DOCKER", "0") != "1":
    parser.add_argument("location", type=str, nargs=1, help="The path to save to.")
 # Optional page size argument
 parser.add_argument("--page-size", type=int, nargs=1, default=[0], help="The number of posts to save per page.")
 args = parser.parse_args()
 mode = args.mode[0]
 page_size = args.page_size[0]
 location = "./archive/" if os.getenv("DOCKER", "0") == "1" else args.location[0]
 # Is location specified a directory?
@ -39,20 +41,15 @@ if not os.path.exists(os.path.join(location, "media")):
 if not os.path.exists(os.path.join(location, "posts")):
    os.mkdir(os.path.join(location, "posts"))
-# Are there any posts already?
+# Get files to search through
-post_ids, existing_posts_html = [], []
+print("Getting previously saved posts and comments...")
-if os.path.exists(os.path.join(location, html_file)):
+existing_ids, existing_posts_html, existing_comments_html = get_previous(location, html_file)
-    with open(os.path.join(location, html_file), encoding="utf-8") as f:
+print(len(existing_posts_html), "previous posts saved.")
-        current_html = f.read()
+print(len(existing_comments_html), "previous comments saved.")
        post_ids = re.findall(r'id="(.+?)"', current_html)
        existing_posts_html = re.findall(
            r'(<div class="post"[\S\n\t\v ]+?<!--postend--><\/div>)',
            current_html
        )
 # Get posts HTML
 posts_html = []
-posts = [p for p in get_posts(client) if p.id not in post_ids]
+posts = [p for p in get_posts(client) if p.id not in existing_ids]
 if not posts:
    print("No new saved posts")
 else:
@ -67,20 +64,9 @@ else:
            f.write(page_html)
 posts_html += existing_posts_html
 # Are there any comments already?
 comment_ids, existing_comments_html = [], []
 if os.path.exists(os.path.join(location, html_file)):
    with open(os.path.join(location, html_file), encoding="utf-8") as f:
        current_html = f.read()
        comment_ids = re.findall(r'id="(.+?)"', current_html)
        existing_comments_html = re.findall(
            r'(<div class="comment"[\S\n\t\v ]+?<!--commentend--><\/div>)',
            current_html
        )
 # Get comments HTML
 comments_html = []
-comments = [c for c in get_comments(client) if c.id not in comment_ids]
+comments = [c for c in get_comments(client) if c.id not in existing_ids]
 if not comments:
    print("No new saved comments")
 else:
@ -90,16 +76,14 @@ else:
        comments_html.append(comment_html)
 comments_html += existing_comments_html
-# Save HTML
+# Save overall HTML
-with open(os.path.join("html", html_file), encoding="utf-8") as f:
+print("Saving HTML...")
-    html = f.read()
+if page_size:
-with open(os.path.join("html", "style.css"), encoding="utf-8") as f:
+    length = max(len(posts_html), len(comments_html))
-    html = html.replace("<style></style>", f"<style>\n{f.read()}\n</style>")
+    page_count = (length // page_size) + 1
-with open(os.path.join("html", "main.js"), encoding="utf-8") as f:
+    for i in range(page_count):
-    html = html.replace("<script></script>", f"<script>\n{f.read()}\n</script>")
+        posts_on_page = posts_html[i*page_size:(i+1)*page_size]
-html = html.replace("<!--posts-->", "\n".join(posts_html))
+        comments_on_page = comments_html[i*page_size:(i+1)*page_size]
-html = html.replace("<!--comments-->", "\n".join(comments_html))
+        has_next = i < page_count - 1
-with open(os.path.join(location, html_file), "w", encoding="utf-8") as f:
+        save_html(posts_on_page, comments_on_page, location, html_file, i, has_next)
-    f.write(html)
+save_html(posts_html, comments_html, location, html_file, None, False)
--- a/utilities.py
+++ b/utilities.py
@ -32,6 +32,39 @@ def make_client():
    )
 def get_previous(location, html_file):
    """Collects the IDs and HTML of everything archived by earlier runs.

    Every numbered page of `html_file` found in `location` (for example
    `saved.0.html`, `saved.1.html`, ...) is read in numeric page order,
    followed by the unpaged `html_file` itself when it exists.

    Returns a tuple `(existing_ids, existing_posts_html, existing_comments_html)`
    of lists. Each list is de-duplicated and keeps first-seen order.
    `existing_ids` holds the value of every `id="..."` attribute found.
    """
    html_files = [f for f in os.listdir(location) if f.endswith(".html")]

    # Require at least one digit and match the whole filename, so stray files
    # such as "saved..html" or "saved.1.html.html" are ignored instead of
    # crashing the int() conversion or resolving to the wrong path.
    stem = os.path.splitext(html_file)[0]
    page_pattern = re.compile(re.escape(stem) + r"\.(\d+)\.html")
    pages = []
    for name in html_files:
        found = page_pattern.fullmatch(name)
        if found:
            pages.append((int(found.group(1)), name))
    pages.sort(key=lambda entry: entry[0])
    matches = [name for _, name in pages]
    if html_file in html_files:
        matches.append(html_file)

    id_pattern = re.compile(r'id="(.+?)"')
    post_pattern = re.compile(
        r'(<div class="post"[\S\n\t\v ]+?<!--postend--><\/div>)'
    )
    comment_pattern = re.compile(
        r'(<div class="comment"[\S\n\t\v ]+?<!--commentend--><\/div>)'
    )

    # Dicts keep insertion order, so they de-duplicate in O(1) per item while
    # preserving the order in which items were first seen.
    ids, posts, comments = {}, {}, {}
    for match in matches:
        with open(os.path.join(location, match), encoding="utf-8") as f:
            current_html = f.read()
        ids.update(dict.fromkeys(id_pattern.findall(current_html)))
        posts.update(dict.fromkeys(post_pattern.findall(current_html)))
        comments.update(dict.fromkeys(comment_pattern.findall(current_html)))
    return list(ids), list(posts), list(comments)
 def get_saved_posts(client):
    """Gets a list of posts that the user has saved."""
@ -100,7 +133,10 @@ def save_media(post, location):
    # Can the media be obtained directly?
    if extension in IMAGE_EXTENSIONS + VIDEO_EXTENSIONS:
        filename = f"{readable_name}_{post.id}.{extension}"
        try:
            response = requests.get(post.url)
        except:
            return
        media_type = response.headers.get("Content-Type", "")
        if media_type.startswith("image") or media_type.startswith("video"):
            with open(os.path.join(location, "media", filename), "wb") as f:
@ -138,7 +174,9 @@ def save_media(post, location):
            direct_url = f'https://i.{url[url.find("//") + 2:]}.{extension}'
            direct_url = direct_url.replace("i.imgur.com", "imgur.com")
            direct_url = direct_url.replace("m.imgur.com", "imgur.com")
            try:
                response = requests.get(direct_url)
            except: continue
            if response.status_code == 200:
                filename = f"{readable_name}_{post.id}.{extension}"
                with open(os.path.join(location, "media", filename), "wb") as f:
@ -158,7 +196,8 @@ def save_media(post, location):
            try:
                ydl.download([url])
            except:
-                pass
+                os.chdir(current)
                return
        for f in os.listdir(os.path.join(location, "media")):
            if f.startswith(f"{readable_name}_{post.id}"):
                return f
@ -237,3 +276,25 @@ def get_comment_html(comment, children=True, op=None):
            children_html.append(get_comment_html(child, children=False, op=op))
        html = html.replace("<!--children-->", "\n".join(children_html))
    return html
 def save_html(posts, comments, location, html_file, page, has_next):
    """Renders one archive page from the `html/` templates and writes it out.

    `posts` and `comments` are lists of HTML snippets substituted for the
    `<!--posts-->` and `<!--comments-->` placeholders. `page` is the
    zero-based page number, or None to write the single unpaged file named
    `html_file`. `has_next` says whether a following page exists.
    """
    def read_asset(name):
        with open(os.path.join("html", name), encoding="utf-8") as handle:
            return handle.read()

    # Start from the template and inline the stylesheet and script.
    document = read_asset(html_file)
    stylesheet = read_asset("style.css")
    document = document.replace("<style></style>", f"<style>\n{stylesheet}\n</style>")
    script = read_asset("main.js")
    document = document.replace("<script></script>", f"<script>\n{script}\n</script>")

    unpaged = page is None

    # "Previous" link: blank its text on the first/unpaged page, otherwise
    # point it at the preceding page.
    if unpaged or page == 0:
        document = document.replace("Previous</a>", "</a>")
    else:
        document = document.replace(".p.html", f".{page-1}.html")

    # "Next" link: point it at the following page when there is one,
    # otherwise blank its text.
    if has_next and not unpaged:
        document = document.replace(".n.html", f".{page+1}.html")
    else:
        document = document.replace("Next</a>", "</a>")

    document = document.replace("<!--posts-->", "\n".join(posts))
    document = document.replace("<!--comments-->", "\n".join(comments))

    if unpaged:
        target = html_file
    else:
        target = html_file.replace(".html", f".{page}.html")
    with open(os.path.join(location, target), "w", encoding="utf-8") as out:
        out.write(document)