Add option to save to multiple smaller HTML files
This commit is contained in:
parent
085690feeb
commit
82ae100c9a
@ -44,6 +44,8 @@ Each post will have its top-level comments saved, as well as each of their immed
|
||||
|
||||
Linked media files (images, videos etc.) will be saved locally where possible, though imgur is currently not well supported in all cases.
|
||||
|
||||
If you want to also break the resultant HTML file into multiple files (as browsers struggle to display enormous HTML files) you can add the `--page-size 100` argument (replacing 100 with however many posts per page you want).
|
||||
|
||||
## Use with Docker
|
||||
|
||||
Rather than installing dependencies locally, you can use docker to create a local image and use that instead. First build the image:
|
||||
|
@ -6,6 +6,10 @@
|
||||
<script></script>
|
||||
</head>
|
||||
<body>
|
||||
<div class="links">
|
||||
<a href="saved.p.html">Previous</a>
|
||||
<a href="saved.n.html">Next</a>
|
||||
</div>
|
||||
<section class="posts-section">
|
||||
<h1>Saved Posts</h1>
|
||||
<!--posts-->
|
||||
@ -14,5 +18,9 @@
|
||||
<h1>Saved Comments</h1>
|
||||
<!--comments-->
|
||||
</section>
|
||||
<div class="links">
|
||||
<a href="saved.p.html">Previous</a>
|
||||
<a href="saved.n.html">Next</a>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
@ -83,6 +83,13 @@ h1 {
|
||||
padding: 8px 16px;
|
||||
}
|
||||
|
||||
.links {
|
||||
padding: 12px 16px 0px;
|
||||
font-size: 12px;
|
||||
display: flex;
|
||||
gap: 8px;
|
||||
}
|
||||
|
||||
.post, .comment {
|
||||
border-top: 1px solid #f0f0f0;
|
||||
padding: 12px 16px;
|
||||
|
@ -6,7 +6,15 @@
|
||||
<script></script>
|
||||
</head>
|
||||
<body>
|
||||
<div class="links">
|
||||
<a href="upvoted.p.html">Previous</a>
|
||||
<a href="upvoted.n.html">Next</a>
|
||||
</div>
|
||||
<h1>Upvoted Posts</h1>
|
||||
<!--posts-->
|
||||
<div class="links">
|
||||
<a href="upvoted.p.html">Previous</a>
|
||||
<a href="upvoted.n.html">Next</a>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
58
save.py
58
save.py
@ -9,11 +9,13 @@ from utilities import *
|
||||
# Get arguments
|
||||
parser = argparse.ArgumentParser(description="Save reddit posts to file.")
|
||||
parser.add_argument("mode", type=str, nargs=1, choices=["saved", "upvoted"], help="The file to convert.")
|
||||
|
||||
if os.getenv("DOCKER", "0") != "1":
|
||||
parser.add_argument("location", type=str, nargs=1, help="The path to save to.")
|
||||
# Optional page size argument
|
||||
parser.add_argument("--page-size", type=int, nargs=1, default=[0], help="The number of posts to save per page.")
|
||||
args = parser.parse_args()
|
||||
mode = args.mode[0]
|
||||
page_size = args.page_size[0]
|
||||
location = "./archive/" if os.getenv("DOCKER", "0") == "1" else args.location[0]
|
||||
|
||||
# Is location specified a directory?
|
||||
@ -39,20 +41,15 @@ if not os.path.exists(os.path.join(location, "media")):
|
||||
if not os.path.exists(os.path.join(location, "posts")):
|
||||
os.mkdir(os.path.join(location, "posts"))
|
||||
|
||||
# Are there any posts already?
|
||||
post_ids, existing_posts_html = [], []
|
||||
if os.path.exists(os.path.join(location, html_file)):
|
||||
with open(os.path.join(location, html_file), encoding="utf-8") as f:
|
||||
current_html = f.read()
|
||||
post_ids = re.findall(r'id="(.+?)"', current_html)
|
||||
existing_posts_html = re.findall(
|
||||
r'(<div class="post"[\S\n\t\v ]+?<!--postend--><\/div>)',
|
||||
current_html
|
||||
)
|
||||
# Get files to search through
|
||||
print("Getting previously saved posts and comments...")
|
||||
existing_ids, existing_posts_html, existing_comments_html = get_previous(location, html_file)
|
||||
print(len(existing_posts_html), "previous posts saved.")
|
||||
print(len(existing_comments_html), "previous comments saved.")
|
||||
|
||||
# Get posts HTML
|
||||
posts_html = []
|
||||
posts = [p for p in get_posts(client) if p.id not in post_ids]
|
||||
posts = [p for p in get_posts(client) if p.id not in existing_ids]
|
||||
if not posts:
|
||||
print("No new saved posts")
|
||||
else:
|
||||
@ -67,20 +64,9 @@ else:
|
||||
f.write(page_html)
|
||||
posts_html += existing_posts_html
|
||||
|
||||
# Are there any comments already?
|
||||
comment_ids, existing_comments_html = [], []
|
||||
if os.path.exists(os.path.join(location, html_file)):
|
||||
with open(os.path.join(location, html_file), encoding="utf-8") as f:
|
||||
current_html = f.read()
|
||||
comment_ids = re.findall(r'id="(.+?)"', current_html)
|
||||
existing_comments_html = re.findall(
|
||||
r'(<div class="comment"[\S\n\t\v ]+?<!--commentend--><\/div>)',
|
||||
current_html
|
||||
)
|
||||
|
||||
# Get comments HTML
|
||||
comments_html = []
|
||||
comments = [c for c in get_comments(client) if c.id not in comment_ids]
|
||||
comments = [c for c in get_comments(client) if c.id not in existing_ids]
|
||||
if not comments:
|
||||
print("No new saved comments")
|
||||
else:
|
||||
@ -90,16 +76,14 @@ else:
|
||||
comments_html.append(comment_html)
|
||||
comments_html += existing_comments_html
|
||||
|
||||
# Save HTML
|
||||
with open(os.path.join("html", html_file), encoding="utf-8") as f:
|
||||
html = f.read()
|
||||
with open(os.path.join("html", "style.css"), encoding="utf-8") as f:
|
||||
html = html.replace("<style></style>", f"<style>\n{f.read()}\n</style>")
|
||||
with open(os.path.join("html", "main.js"), encoding="utf-8") as f:
|
||||
html = html.replace("<script></script>", f"<script>\n{f.read()}\n</script>")
|
||||
html = html.replace("<!--posts-->", "\n".join(posts_html))
|
||||
html = html.replace("<!--comments-->", "\n".join(comments_html))
|
||||
with open(os.path.join(location, html_file), "w", encoding="utf-8") as f:
|
||||
f.write(html)
|
||||
|
||||
|
||||
# Save overall HTML
|
||||
print("Saving HTML...")
|
||||
if page_size:
|
||||
length = max(len(posts_html), len(comments_html))
|
||||
page_count = (length // page_size) + 1
|
||||
for i in range(page_count):
|
||||
posts_on_page = posts_html[i*page_size:(i+1)*page_size]
|
||||
comments_on_page = comments_html[i*page_size:(i+1)*page_size]
|
||||
has_next = i < page_count - 1
|
||||
save_html(posts_on_page, comments_on_page, location, html_file, i, has_next)
|
||||
save_html(posts_html, comments_html, location, html_file, None, False)
|
63
utilities.py
63
utilities.py
@ -32,6 +32,39 @@ def make_client():
|
||||
)
|
||||
|
||||
|
||||
def get_previous(location, html_file):
    """Collect ids and rendered HTML from previously saved output files.

    Scans ``location`` for numbered pages of ``html_file`` (for example
    ``saved.0.html``, ``saved.1.html``) plus the unpaginated ``html_file``
    itself, and extracts every element id, post ``<div>`` and comment
    ``<div>`` already on disk, so the caller can avoid re-saving them.

    :param str location: directory containing previously saved HTML files.
    :param str html_file: base output file name, e.g. ``"saved.html"``.
    :returns: ``(ids, posts_html, comments_html)`` — each list de-duplicated
     in first-seen order (numbered pages in page order, base file last).
    """
    html_files = [f for f in os.listdir(location) if f.endswith(".html")]
    # Build a pattern matching numbered pages such as "saved.12.html".
    # Escape the base name (it could contain regex metacharacters), make
    # the digits mandatory so the page-number sort key always exists, and
    # anchor the end so "saved.1.html.html" cannot slip through as
    # "saved.1.html" (which might not exist on disk).
    page_pattern = re.compile(
        re.escape(html_file).replace(r"\.html", r"\.(\d+)\.html") + "$"
    )
    numbered = []
    for name in html_files:
        m = page_pattern.match(name)
        if m:
            numbered.append((int(m.group(1)), name))
    numbered.sort()  # numeric page order, not lexicographic
    ordered = [name for _, name in numbered]
    # The unpaginated base file is scanned last, after all numbered pages.
    if html_file in html_files:
        ordered.append(html_file)

    # Compile once outside the loop; patterns are applied per file.
    id_pattern = re.compile(r'id="(.+?)"')
    post_pattern = re.compile(
        r'(<div class="post"[\S\n\t\v ]+?<!--postend--><\/div>)'
    )
    comment_pattern = re.compile(
        r'(<div class="comment"[\S\n\t\v ]+?<!--commentend--><\/div>)'
    )

    existing_ids = []
    existing_posts_html = []
    existing_comments_html = []
    # Sets give O(1) membership tests; the lists preserve first-seen order.
    seen_ids, seen_posts, seen_comments = set(), set(), set()
    for name in ordered:
        with open(os.path.join(location, name), encoding="utf-8") as f:
            current_html = f.read()
        for element_id in id_pattern.findall(current_html):
            if element_id not in seen_ids:
                seen_ids.add(element_id)
                existing_ids.append(element_id)
        for post in post_pattern.findall(current_html):
            if post not in seen_posts:
                seen_posts.add(post)
                existing_posts_html.append(post)
        for comment in comment_pattern.findall(current_html):
            if comment not in seen_comments:
                seen_comments.add(comment)
                existing_comments_html.append(comment)
    return existing_ids, existing_posts_html, existing_comments_html
|
||||
|
||||
|
||||
def get_saved_posts(client):
|
||||
"""Gets a list of posts that the user has saved."""
|
||||
|
||||
@ -100,7 +133,10 @@ def save_media(post, location):
|
||||
# Can the media be obtained directly?
|
||||
if extension in IMAGE_EXTENSIONS + VIDEO_EXTENSIONS:
|
||||
filename = f"{readable_name}_{post.id}.{extension}"
|
||||
try:
|
||||
response = requests.get(post.url)
|
||||
except:
|
||||
return
|
||||
media_type = response.headers.get("Content-Type", "")
|
||||
if media_type.startswith("image") or media_type.startswith("video"):
|
||||
with open(os.path.join(location, "media", filename), "wb") as f:
|
||||
@ -138,7 +174,9 @@ def save_media(post, location):
|
||||
direct_url = f'https://i.{url[url.find("//") + 2:]}.{extension}'
|
||||
direct_url = direct_url.replace("i.imgur.com", "imgur.com")
|
||||
direct_url = direct_url.replace("m.imgur.com", "imgur.com")
|
||||
try:
|
||||
response = requests.get(direct_url)
|
||||
except: continue
|
||||
if response.status_code == 200:
|
||||
filename = f"{readable_name}_{post.id}.{extension}"
|
||||
with open(os.path.join(location, "media", filename), "wb") as f:
|
||||
@ -158,7 +196,8 @@ def save_media(post, location):
|
||||
try:
|
||||
ydl.download([url])
|
||||
except:
|
||||
pass
|
||||
os.chdir(current)
|
||||
return
|
||||
for f in os.listdir(os.path.join(location, "media")):
|
||||
if f.startswith(f"{readable_name}_{post.id}"):
|
||||
return f
|
||||
@ -237,3 +276,25 @@ def get_comment_html(comment, children=True, op=None):
|
||||
children_html.append(get_comment_html(child, children=False, op=op))
|
||||
html = html.replace("<!--children-->", "\n".join(children_html))
|
||||
return html
|
||||
|
||||
|
||||
def save_html(posts, comments, location, html_file, page, has_next):
    """Render one output page from the HTML template and write it to disk.

    :param list posts: pre-rendered post HTML snippets for this page.
    :param list comments: pre-rendered comment HTML snippets for this page.
    :param str location: directory the finished page is written into.
    :param str html_file: template file name inside ``html/`` (also the
     base name of the output file).
    :param page: zero-based page number, or ``None`` for single-file output.
    :param bool has_next: whether a following page will also be written.
    """
    with open(os.path.join("html", html_file), encoding="utf-8") as f:
        html = f.read()
    # Inline the stylesheet and script so each page is self-contained.
    with open(os.path.join("html", "style.css"), encoding="utf-8") as f:
        css = f.read()
    html = html.replace("<style></style>", f"<style>\n{css}\n</style>")
    with open(os.path.join("html", "main.js"), encoding="utf-8") as f:
        js = f.read()
    html = html.replace("<script></script>", f"<script>\n{js}\n</script>")
    # Pagination links: blank out the label when there is no neighbouring
    # page, otherwise retarget the placeholder hrefs at it.
    if page is None or page == 0:
        html = html.replace("Previous</a>", "</a>")
    else:
        html = html.replace(".p.html", f".{page - 1}.html")
    if page is None or not has_next:
        html = html.replace("Next</a>", "</a>")
    else:
        html = html.replace(".n.html", f".{page + 1}.html")
    html = html.replace("<!--posts-->", "\n".join(posts))
    html = html.replace("<!--comments-->", "\n".join(comments))
    # Unpaginated output keeps the base name; pages get ".<page>.html".
    if page is None:
        out_name = html_file
    else:
        out_name = html_file.replace(".html", f".{page}.html")
    with open(os.path.join(location, out_name), "w", encoding="utf-8") as f:
        f.write(html)
|
||||
|
Loading…
Reference in New Issue
Block a user