Add option to save to multiple smaller HTML files

Sam Ireland 2023-06-18 18:23:42 +01:00
parent 085690feeb
commit 82ae100c9a
6 changed files with 110 additions and 40 deletions

View File

@@ -44,6 +44,8 @@ Each post will have its top-level comments saved, as well as each of their immediate
 Linked media files (images, videos etc.) will be saved locally where possible, though imgur is currently not well supported in all cases.
+If you also want to break the resulting HTML file into multiple smaller files (browsers struggle to display enormous HTML files), you can add the `--page-size 100` argument, replacing 100 with however many posts you want per page.
 ## Use with Docker
 Rather than installing dependencies locally, you can use docker to create a local image and use that instead. First build the image:
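(Usage sketch for the new flag, with an illustrative archive path: `python save.py saved ./archive --page-size 100`.)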

View File

@@ -6,6 +6,10 @@
         <script></script>
     </head>
     <body>
+        <div class="links">
+            <a href="saved.p.html">Previous</a>
+            <a href="saved.n.html">Next</a>
+        </div>
         <section class="posts-section">
             <h1>Saved Posts</h1>
             <!--posts-->
@@ -14,5 +18,9 @@
             <h1>Saved Comments</h1>
             <!--comments-->
         </section>
+        <div class="links">
+            <a href="saved.p.html">Previous</a>
+            <a href="saved.n.html">Next</a>
+        </div>
     </body>
 </html>
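The `.p.html`/`.n.html` hrefs are placeholders that the new `save_html` helper (added in the last file below) rewrites for each page, blanking the link label on boundary pages. A minimal standalone sketch of that substitution, mirroring the logic added below (`resolve_links` is an illustrative name, not a function in this commit):

```python
# Sketch of the placeholder substitution performed by save_html (see below).
# `template` stands in for the markup above.
template = '<a href="saved.p.html">Previous</a> <a href="saved.n.html">Next</a>'

def resolve_links(html, page, has_next):
    # First page (or unpaginated output): drop the "Previous" label.
    if page == 0 or page is None:
        html = html.replace("Previous</a>", "</a>")
    else:
        html = html.replace(".p.html", f".{page - 1}.html")
    # Last page (or unpaginated output): drop the "Next" label.
    if not has_next or page is None:
        html = html.replace("Next</a>", "</a>")
    else:
        html = html.replace(".n.html", f".{page + 1}.html")
    return html

print(resolve_links(template, 1, True))
# <a href="saved.0.html">Previous</a> <a href="saved.2.html">Next</a>
```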

View File

@@ -83,6 +83,13 @@ h1 {
     padding: 8px 16px;
 }
 
+.links {
+    padding: 12px 16px 0px;
+    font-size: 12px;
+    display: flex;
+    gap: 8px;
+}
+
 .post, .comment {
     border-top: 1px solid #f0f0f0;
     padding: 12px 16px;

View File

@@ -6,7 +6,15 @@
         <script></script>
     </head>
     <body>
+        <div class="links">
+            <a href="upvoted.p.html">Previous</a>
+            <a href="upvoted.n.html">Next</a>
+        </div>
         <h1>Upvoted Posts</h1>
         <!--posts-->
+        <div class="links">
+            <a href="upvoted.p.html">Previous</a>
+            <a href="upvoted.n.html">Next</a>
+        </div>
     </body>
 </html>

save.py
View File

@@ -9,11 +9,13 @@ from utilities import *
 # Get arguments
 parser = argparse.ArgumentParser(description="Save reddit posts to file.")
 parser.add_argument("mode", type=str, nargs=1, choices=["saved", "upvoted"], help="The file to convert.")
 if os.getenv("DOCKER", "0") != "1":
     parser.add_argument("location", type=str, nargs=1, help="The path to save to.")
+# Optional page size argument
+parser.add_argument("--page-size", type=int, nargs=1, default=[0], help="The number of posts to save per page.")
 args = parser.parse_args()
 mode = args.mode[0]
+page_size = args.page_size[0]
 location = "./archive/" if os.getenv("DOCKER", "0") == "1" else args.location[0]
 
 # Is location specified a directory?
@@ -39,20 +41,15 @@ if not os.path.exists(os.path.join(location, "media")):
 if not os.path.exists(os.path.join(location, "posts")):
     os.mkdir(os.path.join(location, "posts"))
-# Are there any posts already?
-post_ids, existing_posts_html = [], []
-if os.path.exists(os.path.join(location, html_file)):
-    with open(os.path.join(location, html_file), encoding="utf-8") as f:
-        current_html = f.read()
-    post_ids = re.findall(r'id="(.+?)"', current_html)
-    existing_posts_html = re.findall(
-        r'(<div class="post"[\S\n\t\v ]+?<!--postend--><\/div>)',
-        current_html
-    )
+# Get files to search through
+print("Getting previously saved posts and comments...")
+existing_ids, existing_posts_html, existing_comments_html = get_previous(location, html_file)
+print(len(existing_posts_html), "previous posts saved.")
+print(len(existing_comments_html), "previous comments saved.")
 # Get posts HTML
 posts_html = []
-posts = [p for p in get_posts(client) if p.id not in post_ids]
+posts = [p for p in get_posts(client) if p.id not in existing_ids]
 if not posts:
     print("No new saved posts")
 else:
@@ -67,20 +64,9 @@ else:
             f.write(page_html)
 posts_html += existing_posts_html
-# Are there any comments already?
-comment_ids, existing_comments_html = [], []
-if os.path.exists(os.path.join(location, html_file)):
-    with open(os.path.join(location, html_file), encoding="utf-8") as f:
-        current_html = f.read()
-    comment_ids = re.findall(r'id="(.+?)"', current_html)
-    existing_comments_html = re.findall(
-        r'(<div class="comment"[\S\n\t\v ]+?<!--commentend--><\/div>)',
-        current_html
-    )
 # Get comments HTML
 comments_html = []
-comments = [c for c in get_comments(client) if c.id not in comment_ids]
+comments = [c for c in get_comments(client) if c.id not in existing_ids]
 if not comments:
     print("No new saved comments")
 else:
@@ -90,16 +76,14 @@ else:
         comments_html.append(comment_html)
 comments_html += existing_comments_html
-# Save HTML
-with open(os.path.join("html", html_file), encoding="utf-8") as f:
-    html = f.read()
-with open(os.path.join("html", "style.css"), encoding="utf-8") as f:
-    html = html.replace("<style></style>", f"<style>\n{f.read()}\n</style>")
-with open(os.path.join("html", "main.js"), encoding="utf-8") as f:
-    html = html.replace("<script></script>", f"<script>\n{f.read()}\n</script>")
-html = html.replace("<!--posts-->", "\n".join(posts_html))
-html = html.replace("<!--comments-->", "\n".join(comments_html))
-with open(os.path.join(location, html_file), "w", encoding="utf-8") as f:
-    f.write(html)
+# Save overall HTML
+print("Saving HTML...")
+if page_size:
+    length = max(len(posts_html), len(comments_html))
+    page_count = (length // page_size) + 1
+    for i in range(page_count):
+        posts_on_page = posts_html[i*page_size:(i+1)*page_size]
+        comments_on_page = comments_html[i*page_size:(i+1)*page_size]
+        has_next = i < page_count - 1
+        save_html(posts_on_page, comments_on_page, location, html_file, i, has_next)
+else:
+    save_html(posts_html, comments_html, location, html_file, None, False)
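A quick standalone check of the paging arithmetic above (not part of the commit): `(length // page_size) + 1` yields one empty trailing page when `length` is an exact multiple of `page_size`, where ceiling division would not.

```python
# Page-count and slice sizes produced by the loop above, for a few list lengths.
page_size = 100
for length in (0, 50, 100, 250):
    page_count = (length // page_size) + 1          # as written in save.py
    ceil_count = -(-length // page_size) or 1       # ceiling-division alternative
    pages = [min(length, (i + 1) * page_size) - i * page_size for i in range(page_count)]
    print(length, page_count, ceil_count, pages)
# 0 1 1 [0]
# 50 1 1 [50]
# 100 2 1 [100, 0]   <- empty trailing page
# 250 3 3 [100, 100, 50]
```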

View File

@@ -32,6 +32,39 @@ def make_client():
     )
 
+
+def get_previous(location, html_file):
+    html_files = [f for f in os.listdir(location) if f.endswith(".html")]
+    pattern = html_file.replace(".html", r"\.(\d+)?\.html")
+    matches = [re.match(pattern, f) for f in html_files]
+    matches = [m[0] for m in matches if m]
+    matches.sort(key=lambda x: int(x.split(".")[1]))
+    existing_ids = []
+    existing_posts_html = []
+    existing_comments_html = []
+    if html_file in html_files: matches.append(html_file)
+    for match in matches:
+        with open(os.path.join(location, match), encoding="utf-8") as f:
+            current_html = f.read()
+        for id in re.findall(r'id="(.+?)"', current_html):
+            if id not in existing_ids:
+                existing_ids.append(id)
+        posts = re.findall(
+            r'(<div class="post"[\S\n\t\v ]+?<!--postend--><\/div>)',
+            current_html
+        )
+        comments = re.findall(
+            r'(<div class="comment"[\S\n\t\v ]+?<!--commentend--><\/div>)',
+            current_html
+        )
+        for post in posts:
+            if post not in existing_posts_html:
+                existing_posts_html.append(post)
+        for comment in comments:
+            if comment not in existing_comments_html:
+                existing_comments_html.append(comment)
+    return existing_ids, existing_posts_html, existing_comments_html
+
 def get_saved_posts(client):
     """Gets a list of posts that the user has saved."""
@@ -100,7 +133,10 @@ def save_media(post, location):
     # Can the media be obtained directly?
     if extension in IMAGE_EXTENSIONS + VIDEO_EXTENSIONS:
         filename = f"{readable_name}_{post.id}.{extension}"
-        response = requests.get(post.url)
+        try:
+            response = requests.get(post.url)
+        except:
+            return
         media_type = response.headers.get("Content-Type", "")
         if media_type.startswith("image") or media_type.startswith("video"):
             with open(os.path.join(location, "media", filename), "wb") as f:
@@ -138,7 +174,9 @@ def save_media(post, location):
         direct_url = f'https://i.{url[url.find("//") + 2:]}.{extension}'
         direct_url = direct_url.replace("i.imgur.com", "imgur.com")
         direct_url = direct_url.replace("m.imgur.com", "imgur.com")
-        response = requests.get(direct_url)
+        try:
+            response = requests.get(direct_url)
+        except: continue
         if response.status_code == 200:
             filename = f"{readable_name}_{post.id}.{extension}"
             with open(os.path.join(location, "media", filename), "wb") as f:
@@ -158,7 +196,8 @@ def save_media(post, location):
             try:
                 ydl.download([url])
             except:
-                pass
+                os.chdir(current)
+                return
     for f in os.listdir(os.path.join(location, "media")):
         if f.startswith(f"{readable_name}_{post.id}"):
             return f
@@ -237,3 +276,25 @@ def get_comment_html(comment, children=True, op=None):
             children_html.append(get_comment_html(child, children=False, op=op))
     html = html.replace("<!--children-->", "\n".join(children_html))
     return html
+
+
+def save_html(posts, comments, location, html_file, page, has_next):
+    with open(os.path.join("html", html_file), encoding="utf-8") as f:
+        html = f.read()
+    with open(os.path.join("html", "style.css"), encoding="utf-8") as f:
+        html = html.replace("<style></style>", f"<style>\n{f.read()}\n</style>")
+    with open(os.path.join("html", "main.js"), encoding="utf-8") as f:
+        html = html.replace("<script></script>", f"<script>\n{f.read()}\n</script>")
+    if page == 0 or page is None:
+        html = html.replace("Previous</a>", "</a>")
+    else:
+        html = html.replace(".p.html", f".{page-1}.html")
+    if not has_next or page is None:
+        html = html.replace("Next</a>", "</a>")
+    else:
+        html = html.replace(".n.html", f".{page+1}.html")
+    html = html.replace("<!--posts-->", "\n".join(posts))
+    html = html.replace("<!--comments-->", "\n".join(comments))
+    file_name = html_file if page is None else html_file.replace(".html", f".{page}.html")
+    with open(os.path.join(location, file_name), "w", encoding="utf-8") as f:
+        f.write(html)
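For reference, a standalone sketch of how `get_previous` discovers earlier pages (file names here are illustrative): for `saved.html` the derived pattern is `saved\.(\d+)?\.html`, which matches `saved.0.html`, `saved.1.html`, and so on; numbered pages are sorted numerically, and the unnumbered file, which the pattern itself never matches, is appended last so it is read too.

```python
import re

html_file = "saved.html"
files = ["saved.html", "saved.10.html", "saved.2.html", "upvoted.0.html", "style.css"]

# Same derivation as in get_previous above: "saved.html" -> r"saved\.(\d+)?\.html"
pattern = html_file.replace(".html", r"\.(\d+)?\.html")
matches = [m[0] for m in (re.match(pattern, f) for f in files) if m]
matches.sort(key=lambda f: int(f.split(".")[1]))   # numeric, not lexicographic
if html_file in files:
    matches.append(html_file)                      # plain file read last
print(matches)  # ['saved.2.html', 'saved.10.html', 'saved.html']
```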