Add option to save to multiple smaller HTML files
This commit is contained in:
parent
085690feeb
commit
82ae100c9a
@ -44,6 +44,8 @@ Each post will have its top-level comments saved, as well as each of their immed
|
|||||||
|
|
||||||
Linked media files (images, videos etc.) will be saved locally where possible, though imgur is currently not well supported in all cases.
|
Linked media files (images, videos etc.) will be saved locally where possible, though imgur is currently not well supported in all cases.
|
||||||
|
|
||||||
|
If you also want to split the resulting HTML file into multiple smaller files (browsers struggle to display enormous HTML files), add the `--page-size 100` argument, replacing 100 with the number of posts you want per page.
|
||||||
|
|
||||||
## Use with Docker
|
## Use with Docker
|
||||||
|
|
||||||
Rather than installing dependencies locally, you can use docker to create a local image and use that instead. First build the image:
|
Rather than installing dependencies locally, you can use docker to create a local image and use that instead. First build the image:
|
||||||
|
@ -6,6 +6,10 @@
|
|||||||
<script></script>
|
<script></script>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
|
<div class="links">
|
||||||
|
<a href="saved.p.html">Previous</a>
|
||||||
|
<a href="saved.n.html">Next</a>
|
||||||
|
</div>
|
||||||
<section class="posts-section">
|
<section class="posts-section">
|
||||||
<h1>Saved Posts</h1>
|
<h1>Saved Posts</h1>
|
||||||
<!--posts-->
|
<!--posts-->
|
||||||
@ -14,5 +18,9 @@
|
|||||||
<h1>Saved Comments</h1>
|
<h1>Saved Comments</h1>
|
||||||
<!--comments-->
|
<!--comments-->
|
||||||
</section>
|
</section>
|
||||||
|
<div class="links">
|
||||||
|
<a href="saved.p.html">Previous</a>
|
||||||
|
<a href="saved.n.html">Next</a>
|
||||||
|
</div>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
@ -83,6 +83,13 @@ h1 {
|
|||||||
padding: 8px 16px;
|
padding: 8px 16px;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.links {
|
||||||
|
padding: 12px 16px 0px;
|
||||||
|
font-size: 12px;
|
||||||
|
display: flex;
|
||||||
|
gap: 8px;
|
||||||
|
}
|
||||||
|
|
||||||
.post, .comment {
|
.post, .comment {
|
||||||
border-top: 1px solid #f0f0f0;
|
border-top: 1px solid #f0f0f0;
|
||||||
padding: 12px 16px;
|
padding: 12px 16px;
|
||||||
|
@ -6,7 +6,15 @@
|
|||||||
<script></script>
|
<script></script>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
|
<div class="links">
|
||||||
|
<a href="upvoted.p.html">Previous</a>
|
||||||
|
<a href="upvoted.n.html">Next</a>
|
||||||
|
</div>
|
||||||
<h1>Upvoted Posts</h1>
|
<h1>Upvoted Posts</h1>
|
||||||
<!--posts-->
|
<!--posts-->
|
||||||
|
<div class="links">
|
||||||
|
<a href="upvoted.p.html">Previous</a>
|
||||||
|
<a href="upvoted.n.html">Next</a>
|
||||||
|
</div>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
58
save.py
58
save.py
@ -9,11 +9,13 @@ from utilities import *
|
|||||||
# Get arguments
|
# Get arguments
|
||||||
parser = argparse.ArgumentParser(description="Save reddit posts to file.")
|
parser = argparse.ArgumentParser(description="Save reddit posts to file.")
|
||||||
parser.add_argument("mode", type=str, nargs=1, choices=["saved", "upvoted"], help="The file to convert.")
|
parser.add_argument("mode", type=str, nargs=1, choices=["saved", "upvoted"], help="The file to convert.")
|
||||||
|
|
||||||
if os.getenv("DOCKER", "0") != "1":
|
if os.getenv("DOCKER", "0") != "1":
|
||||||
parser.add_argument("location", type=str, nargs=1, help="The path to save to.")
|
parser.add_argument("location", type=str, nargs=1, help="The path to save to.")
|
||||||
|
# Optional page size argument
|
||||||
|
parser.add_argument("--page-size", type=int, nargs=1, default=[0], help="The number of posts to save per page.")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
mode = args.mode[0]
|
mode = args.mode[0]
|
||||||
|
page_size = args.page_size[0]
|
||||||
location = "./archive/" if os.getenv("DOCKER", "0") == "1" else args.location[0]
|
location = "./archive/" if os.getenv("DOCKER", "0") == "1" else args.location[0]
|
||||||
|
|
||||||
# Is location specified a directory?
|
# Is location specified a directory?
|
||||||
@ -39,20 +41,15 @@ if not os.path.exists(os.path.join(location, "media")):
|
|||||||
if not os.path.exists(os.path.join(location, "posts")):
|
if not os.path.exists(os.path.join(location, "posts")):
|
||||||
os.mkdir(os.path.join(location, "posts"))
|
os.mkdir(os.path.join(location, "posts"))
|
||||||
|
|
||||||
# Are there any posts already?
|
# Get files to search through
|
||||||
post_ids, existing_posts_html = [], []
|
print("Getting previously saved posts and comments...")
|
||||||
if os.path.exists(os.path.join(location, html_file)):
|
existing_ids, existing_posts_html, existing_comments_html = get_previous(location, html_file)
|
||||||
with open(os.path.join(location, html_file), encoding="utf-8") as f:
|
print(len(existing_posts_html), "previous posts saved.")
|
||||||
current_html = f.read()
|
print(len(existing_comments_html), "previous comments saved.")
|
||||||
post_ids = re.findall(r'id="(.+?)"', current_html)
|
|
||||||
existing_posts_html = re.findall(
|
|
||||||
r'(<div class="post"[\S\n\t\v ]+?<!--postend--><\/div>)',
|
|
||||||
current_html
|
|
||||||
)
|
|
||||||
|
|
||||||
# Get posts HTML
|
# Get posts HTML
|
||||||
posts_html = []
|
posts_html = []
|
||||||
posts = [p for p in get_posts(client) if p.id not in post_ids]
|
posts = [p for p in get_posts(client) if p.id not in existing_ids]
|
||||||
if not posts:
|
if not posts:
|
||||||
print("No new saved posts")
|
print("No new saved posts")
|
||||||
else:
|
else:
|
||||||
@ -67,20 +64,9 @@ else:
|
|||||||
f.write(page_html)
|
f.write(page_html)
|
||||||
posts_html += existing_posts_html
|
posts_html += existing_posts_html
|
||||||
|
|
||||||
# Are there any comments already?
|
|
||||||
comment_ids, existing_comments_html = [], []
|
|
||||||
if os.path.exists(os.path.join(location, html_file)):
|
|
||||||
with open(os.path.join(location, html_file), encoding="utf-8") as f:
|
|
||||||
current_html = f.read()
|
|
||||||
comment_ids = re.findall(r'id="(.+?)"', current_html)
|
|
||||||
existing_comments_html = re.findall(
|
|
||||||
r'(<div class="comment"[\S\n\t\v ]+?<!--commentend--><\/div>)',
|
|
||||||
current_html
|
|
||||||
)
|
|
||||||
|
|
||||||
# Get comments HTML
|
# Get comments HTML
|
||||||
comments_html = []
|
comments_html = []
|
||||||
comments = [c for c in get_comments(client) if c.id not in comment_ids]
|
comments = [c for c in get_comments(client) if c.id not in existing_ids]
|
||||||
if not comments:
|
if not comments:
|
||||||
print("No new saved comments")
|
print("No new saved comments")
|
||||||
else:
|
else:
|
||||||
@ -90,16 +76,14 @@ else:
|
|||||||
comments_html.append(comment_html)
|
comments_html.append(comment_html)
|
||||||
comments_html += existing_comments_html
|
comments_html += existing_comments_html
|
||||||
|
|
||||||
# Save HTML
|
# Save overall HTML
|
||||||
with open(os.path.join("html", html_file), encoding="utf-8") as f:
|
print("Saving HTML...")
|
||||||
html = f.read()
|
if page_size:
|
||||||
with open(os.path.join("html", "style.css"), encoding="utf-8") as f:
|
length = max(len(posts_html), len(comments_html))
|
||||||
html = html.replace("<style></style>", f"<style>\n{f.read()}\n</style>")
|
# Ceiling division: an exact multiple of page_size must not produce a trailing empty page; always keep at least one page.
page_count = max(1, (length + page_size - 1) // page_size)
|
||||||
with open(os.path.join("html", "main.js"), encoding="utf-8") as f:
|
for i in range(page_count):
|
||||||
html = html.replace("<script></script>", f"<script>\n{f.read()}\n</script>")
|
posts_on_page = posts_html[i*page_size:(i+1)*page_size]
|
||||||
html = html.replace("<!--posts-->", "\n".join(posts_html))
|
comments_on_page = comments_html[i*page_size:(i+1)*page_size]
|
||||||
html = html.replace("<!--comments-->", "\n".join(comments_html))
|
has_next = i < page_count - 1
|
||||||
with open(os.path.join(location, html_file), "w", encoding="utf-8") as f:
|
save_html(posts_on_page, comments_on_page, location, html_file, i, has_next)
|
||||||
f.write(html)
|
save_html(posts_html, comments_html, location, html_file, None, False)
|
||||||
|
|
||||||
|
|
67
utilities.py
67
utilities.py
@ -32,6 +32,39 @@ def make_client():
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_previous(location, html_file):
    """Collect the ids, posts and comments already archived in `location`.

    Numbered pages of the archive (``<stem>.<N>.html``, e.g. ``saved.0.html``)
    are read in ascending page order, followed by the combined `html_file`
    itself when it is present in the directory. From each file this extracts
    every ``id="..."`` attribute value, every post block (ending in
    ``<!--postend--></div>``) and every comment block (ending in
    ``<!--commentend--></div>``).

    Args:
        location: Directory that holds the previously written HTML files.
        html_file: Name of the combined archive file, e.g. ``"saved.html"``.

    Returns:
        A tuple ``(existing_ids, existing_posts_html, existing_comments_html)``
        of lists, each de-duplicated and kept in first-seen order.
    """
    stem, extension = os.path.splitext(html_file)
    # The page number is mandatory and the whole name must match, so stray
    # files such as "saved..html" or "saved.1.html.bak.html" are ignored
    # instead of crashing the sort or pointing at a file that does not exist.
    page_pattern = re.compile(
        re.escape(stem) + r"\.(\d+)" + re.escape(extension)
    )

    file_names = os.listdir(location)
    numbered_pages = []
    for name in file_names:
        found = page_pattern.fullmatch(name)
        if found:
            numbered_pages.append((int(found.group(1)), name))
    numbered_pages.sort()

    files_to_read = [name for _, name in numbered_pages]
    # The combined (unpaged) file is read last so paged content keeps priority
    # in the resulting order.
    if html_file in file_names:
        files_to_read.append(html_file)

    existing_ids = []
    existing_posts_html = []
    existing_comments_html = []
    # Each extraction pattern is paired with its ordered output list and a set
    # used for constant-time duplicate checks.
    collectors = (
        (re.compile(r'id="(.+?)"'), existing_ids, set()),
        (
            re.compile(r'(<div class="post"[\S\n\t\v ]+?<!--postend--><\/div>)'),
            existing_posts_html,
            set(),
        ),
        (
            re.compile(r'(<div class="comment"[\S\n\t\v ]+?<!--commentend--><\/div>)'),
            existing_comments_html,
            set(),
        ),
    )

    for name in files_to_read:
        with open(os.path.join(location, name), encoding="utf-8") as f:
            current_html = f.read()
        for pattern, collected, seen in collectors:
            for item in pattern.findall(current_html):
                if item not in seen:
                    seen.add(item)
                    collected.append(item)

    return existing_ids, existing_posts_html, existing_comments_html
|
||||||
|
|
||||||
|
|
||||||
def get_saved_posts(client):
|
def get_saved_posts(client):
|
||||||
"""Gets a list of posts that the user has saved."""
|
"""Gets a list of posts that the user has saved."""
|
||||||
|
|
||||||
@ -100,7 +133,10 @@ def save_media(post, location):
|
|||||||
# Can the media be obtained directly?
|
# Can the media be obtained directly?
|
||||||
if extension in IMAGE_EXTENSIONS + VIDEO_EXTENSIONS:
|
if extension in IMAGE_EXTENSIONS + VIDEO_EXTENSIONS:
|
||||||
filename = f"{readable_name}_{post.id}.{extension}"
|
filename = f"{readable_name}_{post.id}.{extension}"
|
||||||
response = requests.get(post.url)
|
try:
|
||||||
|
response = requests.get(post.url)
|
||||||
|
except:
|
||||||
|
return
|
||||||
media_type = response.headers.get("Content-Type", "")
|
media_type = response.headers.get("Content-Type", "")
|
||||||
if media_type.startswith("image") or media_type.startswith("video"):
|
if media_type.startswith("image") or media_type.startswith("video"):
|
||||||
with open(os.path.join(location, "media", filename), "wb") as f:
|
with open(os.path.join(location, "media", filename), "wb") as f:
|
||||||
@ -138,7 +174,9 @@ def save_media(post, location):
|
|||||||
direct_url = f'https://i.{url[url.find("//") + 2:]}.{extension}'
|
direct_url = f'https://i.{url[url.find("//") + 2:]}.{extension}'
|
||||||
direct_url = direct_url.replace("i.imgur.com", "imgur.com")
|
direct_url = direct_url.replace("i.imgur.com", "imgur.com")
|
||||||
direct_url = direct_url.replace("m.imgur.com", "imgur.com")
|
direct_url = direct_url.replace("m.imgur.com", "imgur.com")
|
||||||
response = requests.get(direct_url)
|
try:
|
||||||
|
response = requests.get(direct_url)
|
||||||
|
except: continue
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
filename = f"{readable_name}_{post.id}.{extension}"
|
filename = f"{readable_name}_{post.id}.{extension}"
|
||||||
with open(os.path.join(location, "media", filename), "wb") as f:
|
with open(os.path.join(location, "media", filename), "wb") as f:
|
||||||
@ -158,7 +196,8 @@ def save_media(post, location):
|
|||||||
try:
|
try:
|
||||||
ydl.download([url])
|
ydl.download([url])
|
||||||
except:
|
except:
|
||||||
pass
|
os.chdir(current)
|
||||||
|
return
|
||||||
for f in os.listdir(os.path.join(location, "media")):
|
for f in os.listdir(os.path.join(location, "media")):
|
||||||
if f.startswith(f"{readable_name}_{post.id}"):
|
if f.startswith(f"{readable_name}_{post.id}"):
|
||||||
return f
|
return f
|
||||||
@ -237,3 +276,25 @@ def get_comment_html(comment, children=True, op=None):
|
|||||||
children_html.append(get_comment_html(child, children=False, op=op))
|
children_html.append(get_comment_html(child, children=False, op=op))
|
||||||
html = html.replace("<!--children-->", "\n".join(children_html))
|
html = html.replace("<!--children-->", "\n".join(children_html))
|
||||||
return html
|
return html
|
||||||
|
|
||||||
|
|
||||||
|
def save_html(posts, comments, location, html_file, page, has_next):
    """Fill in the page template and write it into `location`.

    The template ``html/<html_file>`` is read relative to the current working
    directory, ``html/style.css`` and ``html/main.js`` are inlined into its
    empty ``<style>`` / ``<script>`` tags, the Previous/Next links are either
    blanked or pointed at the neighbouring page numbers, and the post and
    comment placeholders are replaced by the given HTML fragments.

    Args:
        posts: List of post HTML strings for this page.
        comments: List of comment HTML strings for this page.
        location: Directory the finished file is written to.
        html_file: Template name and base output name, e.g. ``"saved.html"``.
        page: Zero-based page number, or None for the single combined file.
        has_next: Whether a following page exists.
    """
    def read_asset(name):
        # Every asset lives in the "html" directory next to the script.
        with open(os.path.join("html", name), encoding="utf-8") as handle:
            return handle.read()

    document = read_asset(html_file)
    stylesheet = read_asset("style.css")
    document = document.replace("<style></style>", f"<style>\n{stylesheet}\n</style>")
    script = read_asset("main.js")
    document = document.replace("<script></script>", f"<script>\n{script}\n</script>")

    # The combined file and the first page have nothing to go back to.
    lacks_previous = page is None or page == 0
    if lacks_previous:
        document = document.replace("Previous</a>", "</a>")
    else:
        document = document.replace(".p.html", f".{page-1}.html")

    # The combined file and the last page have nothing to advance to.
    lacks_next = page is None or not has_next
    if lacks_next:
        document = document.replace("Next</a>", "</a>")
    else:
        document = document.replace(".n.html", f".{page+1}.html")

    document = document.replace("<!--posts-->", "\n".join(posts))
    document = document.replace("<!--comments-->", "\n".join(comments))

    if page is None:
        file_name = html_file
    else:
        file_name = html_file.replace(".html", f".{page}.html")
    with open(os.path.join(location, file_name), "w", encoding="utf-8") as out:
        out.write(document)
|
||||||
|
Loading…
Reference in New Issue
Block a user