#!/usr/bin/env python
import argparse
import os
import re
from tqdm import tqdm
from utilities import *
# Get arguments.
# NOTE: each positional uses nargs=1, so every value arrives as a 1-element
# list and is unwrapped with [0] below — kept as-is for consistency.
parser = argparse.ArgumentParser(description="Save reddit posts to file.")
parser.add_argument(
    "mode", type=str, nargs=1, choices=["saved", "upvoted"],
    # Fixed help text: the original said "The file to convert.", a
    # copy-paste error — this flag selects which listing to archive.
    help="Which listing to archive: saved or upvoted items.",
)
# Inside Docker the archive location is fixed to ./archive/, so the
# positional is only registered when running outside a container.
if os.getenv("DOCKER", "0") != "1":
    parser.add_argument("location", type=str, nargs=1, help="The path to save to.")

# Optional page size argument (0, the default, means one single page).
parser.add_argument(
    "--page-size", type=int, nargs=1, default=[0],
    help="The number of posts to save per page.",
)

args = parser.parse_args()
mode = args.mode[0]
page_size = args.page_size[0]
location = "./archive/" if os.getenv("DOCKER", "0") == "1" else args.location[0]
# Is the location specified a directory?
if not os.path.isdir(location):
    print(location, "is not a directory")
    # Abort here: everything below writes into `location`, so falling
    # through (as the original did) just crashes later with a confusing
    # FileNotFoundError from os.mkdir.
    raise SystemExit(1)
# Make a client object.
# make_client() comes from the `utilities` star import; presumably it builds
# the authenticated reddit API client — credential handling is not visible
# here, so confirm against utilities.py.
client = make_client()
# Saved posts or upvoted posts? Pick the output page name and the two
# fetchers in one go; the upvoted mode archives no comments (its fetcher
# always yields an empty list).
if mode == "saved":
    html_file, get_posts, get_comments = (
        "saved.html", get_saved_posts, get_saved_comments,
    )
else:
    html_file, get_posts, get_comments = (
        "upvoted.html", get_upvoted_posts, lambda client: [],
    )
# Make directories for media and per-post pages. os.makedirs with
# exist_ok=True replaces the original's check-then-mkdir pairs: same
# result, but no TOCTOU race if the directory appears between the
# existence check and the mkdir call.
os.makedirs(os.path.join(location, "media"), exist_ok=True)
os.makedirs(os.path.join(location, "posts"), exist_ok=True)
# Get files to search through: load what earlier runs already archived so
# those items can be skipped and their HTML re-used.
print("Getting previously saved posts and comments...")
previous = get_previous(location, html_file)
existing_ids, existing_posts_html, existing_comments_html = previous
print(len(existing_posts_html), "previous posts.")
print(len(existing_comments_html), "previous comments.")
# Get posts HTML: render each newly fetched post, download its media, and
# write a standalone page for it under posts/.
posts_html = []
posts = [p for p in get_posts(client) if p.id not in existing_ids]
if posts:
    for post in tqdm(posts):
        html = get_post_html(post)
        saved = save_media(post, location)
        if saved:
            html = add_media_preview_to_html(html, saved)
        posts_html.append(html)
        page_path = os.path.join(location, "posts", f"{post.id}.html")
        with open(page_path, "w", encoding="utf-8") as f:
            f.write(create_post_page_html(post, html))
else:
    print("No new posts")
posts_html += existing_posts_html
# Get comments HTML for newly fetched comments.
comments_html = []
comments = [c for c in get_comments(client) if c.id not in existing_ids]
if not comments:
    print("No new comments")
else:
    for comment in tqdm(comments):
        comment_html = get_comment_html(comment)
        # Bug fix: the original called save_media(post, location) here,
        # reusing the stale `post` variable left over from the posts loop —
        # re-saving the last post's media once per comment, and raising
        # NameError when there were no new posts. The result was never
        # used; comments carry no media, so the call is simply dropped.
        comments_html.append(comment_html)
comments_html += existing_comments_html
# Save overall HTML. With a page size, split posts and comments into
# numbered pages; otherwise write everything as one unpaginated file.
print("Saving HTML...")
if page_size:
    length = max(len(posts_html), len(comments_html))
    # Ceiling division, clamped to at least one page. The original used
    # (length // page_size) + 1, which produced a spurious empty trailing
    # page (with a dangling "next" link from the last real page) whenever
    # length was an exact multiple of page_size.
    page_count = max(1, -(-length // page_size))
    for i in range(page_count):
        posts_on_page = posts_html[i * page_size:(i + 1) * page_size]
        comments_on_page = comments_html[i * page_size:(i + 1) * page_size]
        has_next = i < page_count - 1
        save_html(posts_on_page, comments_on_page, location, html_file, i, has_next)
else:
    save_html(posts_html, comments_html, location, html_file, None, False)