Add option to save to multiple smaller HTML files

This commit is contained in:
Sam Ireland 2023-06-18 18:23:42 +01:00
parent 085690feeb
commit 82ae100c9a
6 changed files with 110 additions and 40 deletions

View File

@ -44,6 +44,8 @@ Each post will have its top-level comments saved, as well as each of their immed
Linked media files (images, videos etc.) will be saved locally where possible, though imgur is currently not well supported in all cases. Linked media files (images, videos etc.) will be saved locally where possible, though imgur is currently not well supported in all cases.
If you also want to split the resulting HTML into multiple smaller files (browsers struggle to display enormous HTML files), add the `--page-size 100` argument, replacing 100 with the number of posts you want per page.
## Use with Docker ## Use with Docker
Rather than installing dependencies locally, you can use docker to create a local image and use that instead. First build the image: Rather than installing dependencies locally, you can use docker to create a local image and use that instead. First build the image:

View File

@ -6,6 +6,10 @@
<script></script> <script></script>
</head> </head>
<body> <body>
<div class="links">
<a href="saved.p.html">Previous</a>
<a href="saved.n.html">Next</a>
</div>
<section class="posts-section"> <section class="posts-section">
<h1>Saved Posts</h1> <h1>Saved Posts</h1>
<!--posts--> <!--posts-->
@ -14,5 +18,9 @@
<h1>Saved Comments</h1> <h1>Saved Comments</h1>
<!--comments--> <!--comments-->
</section> </section>
<!-- Bottom pagination links; the hrefs are placeholders rewritten when the page is saved. -->
<div class="links">
<a href="saved.p.html">Previous</a>
<a href="saved.n.html">Next</a>
</div>
</body> </body>
</html> </html>

View File

@ -83,6 +83,13 @@ h1 {
padding: 8px 16px; padding: 8px 16px;
} }
.links {
padding: 12px 16px 0px;
font-size: 12px;
display: flex;
gap: 8px;
}
.post, .comment { .post, .comment {
border-top: 1px solid #f0f0f0; border-top: 1px solid #f0f0f0;
padding: 12px 16px; padding: 12px 16px;

View File

@ -6,7 +6,15 @@
<script></script> <script></script>
</head> </head>
<body> <body>
<div class="links">
<a href="upvoted.p.html">Previous</a>
<a href="upvoted.n.html">Next</a>
</div>
<h1>Upvoted Posts</h1> <h1>Upvoted Posts</h1>
<!--posts--> <!--posts-->
<div class="links">
<a href="upvoted.p.html">Previous</a>
<a href="upvoted.n.html">Next</a>
</div>
</body> </body>
</html> </html>

58
save.py
View File

@ -9,11 +9,13 @@ from utilities import *
# Get arguments # Get arguments
parser = argparse.ArgumentParser(description="Save reddit posts to file.") parser = argparse.ArgumentParser(description="Save reddit posts to file.")
parser.add_argument("mode", type=str, nargs=1, choices=["saved", "upvoted"], help="The file to convert.") parser.add_argument("mode", type=str, nargs=1, choices=["saved", "upvoted"], help="The file to convert.")
if os.getenv("DOCKER", "0") != "1": if os.getenv("DOCKER", "0") != "1":
parser.add_argument("location", type=str, nargs=1, help="The path to save to.") parser.add_argument("location", type=str, nargs=1, help="The path to save to.")
# Optional page size argument
parser.add_argument("--page-size", type=int, nargs=1, default=[0], help="The number of posts to save per page.")
args = parser.parse_args() args = parser.parse_args()
mode = args.mode[0] mode = args.mode[0]
page_size = args.page_size[0]
location = "./archive/" if os.getenv("DOCKER", "0") == "1" else args.location[0] location = "./archive/" if os.getenv("DOCKER", "0") == "1" else args.location[0]
# Is location specified a directory? # Is location specified a directory?
@ -39,20 +41,15 @@ if not os.path.exists(os.path.join(location, "media")):
if not os.path.exists(os.path.join(location, "posts")): if not os.path.exists(os.path.join(location, "posts")):
os.mkdir(os.path.join(location, "posts")) os.mkdir(os.path.join(location, "posts"))
# Are there any posts already? # Get files to search through
post_ids, existing_posts_html = [], [] print("Getting previously saved posts and comments...")
if os.path.exists(os.path.join(location, html_file)): existing_ids, existing_posts_html, existing_comments_html = get_previous(location, html_file)
with open(os.path.join(location, html_file), encoding="utf-8") as f: print(len(existing_posts_html), "previous posts saved.")
current_html = f.read() print(len(existing_comments_html), "previous comments saved.")
post_ids = re.findall(r'id="(.+?)"', current_html)
existing_posts_html = re.findall(
r'(<div class="post"[\S\n\t\v ]+?<!--postend--><\/div>)',
current_html
)
# Get posts HTML # Get posts HTML
posts_html = [] posts_html = []
posts = [p for p in get_posts(client) if p.id not in post_ids] posts = [p for p in get_posts(client) if p.id not in existing_ids]
if not posts: if not posts:
print("No new saved posts") print("No new saved posts")
else: else:
@ -67,20 +64,9 @@ else:
f.write(page_html) f.write(page_html)
posts_html += existing_posts_html posts_html += existing_posts_html
# Are there any comments already?
comment_ids, existing_comments_html = [], []
if os.path.exists(os.path.join(location, html_file)):
with open(os.path.join(location, html_file), encoding="utf-8") as f:
current_html = f.read()
comment_ids = re.findall(r'id="(.+?)"', current_html)
existing_comments_html = re.findall(
r'(<div class="comment"[\S\n\t\v ]+?<!--commentend--><\/div>)',
current_html
)
# Get comments HTML # Get comments HTML
comments_html = [] comments_html = []
comments = [c for c in get_comments(client) if c.id not in comment_ids] comments = [c for c in get_comments(client) if c.id not in existing_ids]
if not comments: if not comments:
print("No new saved comments") print("No new saved comments")
else: else:
@ -90,16 +76,14 @@ else:
comments_html.append(comment_html) comments_html.append(comment_html)
comments_html += existing_comments_html comments_html += existing_comments_html
# Save HTML # Save overall HTML
with open(os.path.join("html", html_file), encoding="utf-8") as f: print("Saving HTML...")
html = f.read() if page_size:
with open(os.path.join("html", "style.css"), encoding="utf-8") as f: length = max(len(posts_html), len(comments_html))
html = html.replace("<style></style>", f"<style>\n{f.read()}\n</style>") page_count = (length // page_size) + 1
with open(os.path.join("html", "main.js"), encoding="utf-8") as f: for i in range(page_count):
html = html.replace("<script></script>", f"<script>\n{f.read()}\n</script>") posts_on_page = posts_html[i*page_size:(i+1)*page_size]
html = html.replace("<!--posts-->", "\n".join(posts_html)) comments_on_page = comments_html[i*page_size:(i+1)*page_size]
html = html.replace("<!--comments-->", "\n".join(comments_html)) has_next = i < page_count - 1
with open(os.path.join(location, html_file), "w", encoding="utf-8") as f: save_html(posts_on_page, comments_on_page, location, html_file, i, has_next)
f.write(html) save_html(posts_html, comments_html, location, html_file, None, False)

View File

@ -32,6 +32,39 @@ def make_client():
) )
def get_previous(location, html_file):
    """Collect ids and HTML fragments of posts/comments saved by earlier runs.

    Scans ``location`` for paged output files named ``<stem>.<page>.html``
    (where ``<stem>.html`` is ``html_file``) and for ``html_file`` itself.
    Paged files are read in ascending page order, followed by the unpaged
    file, and every distinct item is kept in first-seen order.

    Args:
        location: Directory holding previously written HTML files.
        html_file: Name of the unpaged output file, e.g. ``"saved.html"``.

    Returns:
        A tuple ``(existing_ids, existing_posts_html, existing_comments_html)``
        of lists: every distinct ``id="..."`` attribute value found, every
        distinct post ``<div>`` and every distinct comment ``<div>``.

    Raises:
        OSError: If ``location`` cannot be listed or a file cannot be read.
    """
    html_files = [f for f in os.listdir(location) if f.endswith(".html")]

    # Match the whole file name and require at least one digit, so stray
    # files such as "saved..html" are ignored instead of crashing the sort.
    suffix = ".html"
    stem = html_file[:-len(suffix)] if html_file.endswith(suffix) else html_file
    page_pattern = re.compile(re.escape(stem) + r"\.(\d+)\.html")

    paged = []
    for name in html_files:
        found = page_pattern.fullmatch(name)
        if found:
            # Sort on the captured page number rather than splitting on ".",
            # which would pick the wrong field if the stem contains a dot.
            paged.append((int(found.group(1)), name))
    paged.sort()

    sources = [name for _, name in paged]
    if html_file in html_files:
        sources.append(html_file)

    # Dicts give O(1) membership tests while preserving first-seen order.
    existing_ids = {}
    existing_posts_html = {}
    existing_comments_html = {}
    for source in sources:
        with open(os.path.join(location, source), encoding="utf-8") as f:
            current_html = f.read()
        existing_ids.update(
            dict.fromkeys(re.findall(r'id="(.+?)"', current_html))
        )
        existing_posts_html.update(dict.fromkeys(re.findall(
            r'(<div class="post"[\S\n\t\v ]+?<!--postend--><\/div>)',
            current_html
        )))
        existing_comments_html.update(dict.fromkeys(re.findall(
            r'(<div class="comment"[\S\n\t\v ]+?<!--commentend--><\/div>)',
            current_html
        )))

    return (
        list(existing_ids),
        list(existing_posts_html),
        list(existing_comments_html),
    )
def get_saved_posts(client): def get_saved_posts(client):
"""Gets a list of posts that the user has saved.""" """Gets a list of posts that the user has saved."""
@ -100,7 +133,10 @@ def save_media(post, location):
# Can the media be obtained directly? # Can the media be obtained directly?
if extension in IMAGE_EXTENSIONS + VIDEO_EXTENSIONS: if extension in IMAGE_EXTENSIONS + VIDEO_EXTENSIONS:
filename = f"{readable_name}_{post.id}.{extension}" filename = f"{readable_name}_{post.id}.{extension}"
try:
response = requests.get(post.url) response = requests.get(post.url)
except:
return
media_type = response.headers.get("Content-Type", "") media_type = response.headers.get("Content-Type", "")
if media_type.startswith("image") or media_type.startswith("video"): if media_type.startswith("image") or media_type.startswith("video"):
with open(os.path.join(location, "media", filename), "wb") as f: with open(os.path.join(location, "media", filename), "wb") as f:
@ -138,7 +174,9 @@ def save_media(post, location):
direct_url = f'https://i.{url[url.find("//") + 2:]}.{extension}' direct_url = f'https://i.{url[url.find("//") + 2:]}.{extension}'
direct_url = direct_url.replace("i.imgur.com", "imgur.com") direct_url = direct_url.replace("i.imgur.com", "imgur.com")
direct_url = direct_url.replace("m.imgur.com", "imgur.com") direct_url = direct_url.replace("m.imgur.com", "imgur.com")
try:
response = requests.get(direct_url) response = requests.get(direct_url)
except: continue
if response.status_code == 200: if response.status_code == 200:
filename = f"{readable_name}_{post.id}.{extension}" filename = f"{readable_name}_{post.id}.{extension}"
with open(os.path.join(location, "media", filename), "wb") as f: with open(os.path.join(location, "media", filename), "wb") as f:
@ -158,7 +196,8 @@ def save_media(post, location):
try: try:
ydl.download([url]) ydl.download([url])
except: except:
pass os.chdir(current)
return
for f in os.listdir(os.path.join(location, "media")): for f in os.listdir(os.path.join(location, "media")):
if f.startswith(f"{readable_name}_{post.id}"): if f.startswith(f"{readable_name}_{post.id}"):
return f return f
@ -237,3 +276,25 @@ def get_comment_html(comment, children=True, op=None):
children_html.append(get_comment_html(child, children=False, op=op)) children_html.append(get_comment_html(child, children=False, op=op))
html = html.replace("<!--children-->", "\n".join(children_html)) html = html.replace("<!--children-->", "\n".join(children_html))
return html return html
def _remove_nav_link(html, label):
    """Return *html* with every ``<a ...>label</a>`` element removed."""
    anchor = re.compile(
        r"[ \t]*<a\b[^>]*>" + re.escape(label) + r"</a>[ \t]*\n?"
    )
    return anchor.sub("", html)


def save_html(posts, comments, location, html_file, page, has_next):
    """Render the template for *html_file* and write it into *location*.

    The template, ``style.css`` and ``main.js`` are read from the ``html``
    directory relative to the current working directory.

    Args:
        posts: HTML fragments substituted for the ``<!--posts-->`` marker.
        comments: HTML fragments substituted for ``<!--comments-->``.
        location: Directory the rendered file is written to.
        html_file: Template name, also the unpaged output name
            (e.g. ``"saved.html"``).
        page: Zero-based page index, or ``None`` for the single unpaged
            file. Page ``n`` is written as ``<stem>.<n>.html``.
        has_next: Whether a page follows this one; ignored when ``page``
            is ``None``.

    Raises:
        OSError: If a template asset cannot be read or the output cannot
            be written.
    """
    def read_asset(name):
        with open(os.path.join("html", name), encoding="utf-8") as f:
            return f.read()

    html = read_asset(html_file)

    # Resolve navigation on the bare template, before any CSS, JS or saved
    # content is injected, so the rewriting can never touch that content.
    # Links with no target are removed outright rather than left behind as
    # empty anchors pointing at the ".p.html"/".n.html" placeholders.
    if page is None or page == 0:
        html = _remove_nav_link(html, "Previous")
    else:
        html = html.replace(".p.html", f".{page - 1}.html")
    if page is None or not has_next:
        html = _remove_nav_link(html, "Next")
    else:
        html = html.replace(".n.html", f".{page + 1}.html")

    # Fill every placeholder in one pass: inserted text is not rescanned,
    # so a post containing a literal "<!--comments-->" stays untouched.
    fills = {
        "<style></style>": f"<style>\n{read_asset('style.css')}\n</style>",
        "<script></script>": f"<script>\n{read_asset('main.js')}\n</script>",
        "<!--posts-->": "\n".join(posts),
        "<!--comments-->": "\n".join(comments),
    }
    placeholder = re.compile("|".join(re.escape(key) for key in fills))
    # A function replacement keeps backslashes in the content literal.
    html = placeholder.sub(lambda found: fills[found.group(0)], html)

    if page is None:
        file_name = html_file
    else:
        file_name = html_file.replace(".html", f".{page}.html")
    with open(os.path.join(location, file_name), "w", encoding="utf-8") as f:
        f.write(html)