import os
import praw
import requests
from redvid import Downloader
import yt_dlp
import re
from datetime import datetime

try:
    from logindata import REDDIT_USERNAME, REDDIT_PASSWORD
    from logindata import REDDIT_CLIENT_ID, REDDIT_SECRET
except ImportError:
    REDDIT_USERNAME = os.getenv("REDDIT_USERNAME")
    REDDIT_PASSWORD = os.getenv("REDDIT_PASSWORD")
    REDDIT_CLIENT_ID = os.getenv("REDDIT_CLIENT_ID")
    REDDIT_SECRET = os.getenv("REDDIT_SECRET")

IMAGE_EXTENSIONS = ["gif", "gifv", "jpg", "jpeg", "png"]
VIDEO_EXTENSIONS = ["mp4"]
PLATFORMS = ["redgifs.com", "gfycat.com", "imgur.com", "youtube.com"]


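# If logindata.py is absent, the four REDDIT_* environment variables above
# must be set in the environment before any of these utilities are used.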
def make_client():
    """Creates a PRAW client with the details in logindata.py (or the
    corresponding environment variables)."""

    print(REDDIT_USERNAME)
    return praw.Reddit(
        username=REDDIT_USERNAME,
        password=REDDIT_PASSWORD,
        client_id=REDDIT_CLIENT_ID,
        client_secret=REDDIT_SECRET,
        user_agent="reddit-save",
    )


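# As an illustration: for a hypothetical html_file of "saved.html", the
# function below collects "saved.html", "saved.0.html", "saved.1.html", ...
# from the output location and returns the ids plus the post/comment divs
# already archived in them, so previously saved items can be detected and reused.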
def get_previous(location, html_file):
    """Collects the ids and the post/comment HTML already present in earlier
    archive pages at the given location."""

    html_files = [f for f in os.listdir(location) if f.endswith(".html")]
    pattern = html_file.replace(".html", r"\.(\d+)?\.html")
    matches = [re.match(pattern, f) for f in html_files]
    matches = [m[0] for m in matches if m]
    matches.sort(key=lambda x: int(x.split(".")[1]))
    existing_ids = []
    existing_posts_html = []
    existing_comments_html = []
    if html_file in html_files: matches.append(html_file)
    for match in matches:
        with open(os.path.join(location, match), encoding="utf-8") as f:
            current_html = f.read()
        for id in re.findall(r'id="(.+?)"', current_html):
            if id not in existing_ids:
                existing_ids.append(id)
        posts = re.findall(
            r'(<div class="post"[\S\n\t\v ]+?<!--postend--><\/div>)',
            current_html
        )
        comments = re.findall(
            r'(<div class="comment"[\S\n\t\v ]+?<!--commentend--><\/div>)',
            current_html
        )
        for post in posts:
            if post not in existing_posts_html:
                existing_posts_html.append(post)
        for comment in comments:
            if comment not in existing_comments_html:
                existing_comments_html.append(comment)
    return existing_ids, existing_posts_html, existing_comments_html


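# The listing helpers below distinguish posts from comments by class name:
# PRAW returns posts as Submission objects and comments as Comment objects,
# so checking __class__.__name__ against "Submission" splits the two.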
def get_saved_posts(client):
    """Gets a list of posts that the user has saved."""

    return [
        saved for saved in client.user.me().saved(limit=None)
        if saved.__class__.__name__ == "Submission"
    ]


def get_upvoted_posts(client):
    """Gets a list of posts that the user has upvoted."""

    return [
        upvoted for upvoted in client.user.me().upvoted(limit=None)
        if upvoted.__class__.__name__ == "Submission"
    ]


def get_saved_comments(client):
    """Gets a list of comments that the user has saved."""

    return [
        saved for saved in client.user.me().saved(limit=None)
        if saved.__class__.__name__ != "Submission"
    ]


def get_user_posts(client, username):
    """Gets a list of posts that the user has made."""

    return [
        post for post in client.redditor(username).submissions.new(limit=None)
    ]


def get_user_comments(client, username):
    """Gets a list of comments that the user has made."""

    return [
        comment for comment in client.redditor(username).comments.new(limit=None)
    ]


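# get_post_html() fills the placeholder comments (<!--title-->, <!--id-->,
# <!--body--> and so on) in html/post-div.html; the <!--preview--> placeholder
# is left alone here and filled in later by add_media_preview_to_html().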
def get_post_html(post):
    """Takes a post object and creates the HTML for it - but not including the
    preview HTML."""

    with open(os.path.join("html", "post-div.html"), encoding="utf-8") as f:
        html = f.read()
    dt = datetime.utcfromtimestamp(post.created_utc)
    html = html.replace("<!--title-->", post.title)
    html = html.replace("<!--subreddit-->", f"/r/{str(post.subreddit)}")
    html = html.replace("<!--user-->", f"/u/{post.author.name}" if post.author else "[deleted]")
    html = html.replace("<!--link-->", f"posts/{post.id}.html")
    html = html.replace("<!--reddit-link-->", f"https://reddit.com{post.permalink}")
    html = html.replace("<!--content-link-->", post.url)
    html = html.replace("<!--id-->", post.id)
    html = html.replace("<!--body-->", (post.selftext_html or "").replace(
        '<a href="/r/', '<a href="https://reddit.com/r/'
    ))
    html = html.replace("<!--timestamp-->", str(dt))
    html = html.replace("<!--date-->", dt.strftime("%d %B, %Y"))
    return html


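# save_media() returns a list of saved filenames (galleries can produce
# several) or None when nothing could be downloaded; the caller can feed the
# list straight into add_media_preview_to_html().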
def save_media(post, location):
    """Takes a post object and tries to download any image/video it might be
    associated with. If it can, it will return the filenames."""

    url = post.url
    stripped_url = url.split("?")[0]
    if url.endswith(post.permalink): return None

    # What is the key information?
    extension = stripped_url.split(".")[-1].lower()
    domain = ".".join(post.url.split("/")[2].split(".")[-2:])
    readable_name = list(filter(bool, post.permalink.split("/")))[-1]

    # Handle galleries. When the saved post is a crosspost, we don't know if it
    # is a gallery, so check the crosspost parents for gallery data too.
    if hasattr(post, "gallery_data") or ("gallery" in url and hasattr(post, "crosspost_parent_list")):
        if not hasattr(post, "gallery_data") and hasattr(post, "crosspost_parent_list"):
            for crosspost in post.crosspost_parent_list:
                if crosspost.get("gallery_data") is not None:
                    # Hard hack: copy the gallery data onto the post itself.
                    post.gallery_data = crosspost["gallery_data"]
                    post.media_metadata = crosspost["media_metadata"]
                    break
        if not hasattr(post, "gallery_data"): return None

        images = []
        for item in sorted(post.gallery_data['items'], key=lambda x: x['id']):
            media_id = item['media_id']
            meta = post.media_metadata[media_id]
            source = meta['s']
            if meta['e'] == 'Image':
                url = source['u']
            elif meta['e'] == 'AnimatedImage':
                url = source['gif']
            else:
                return None
            stripped_url = url.split("?")[0]
            extension = stripped_url.split(".")[-1].lower()
            filename = f"{readable_name}_{post.id}_{media_id}.{extension}"
            try:
                response = requests.get(url)
                with open(os.path.join(location, "media", filename), "wb") as f:
                    f.write(response.content)
                images.append(filename)
            except:
                print(f"Failed to download {url}")
        return images if len(images) > 0 else None

    # If it's an imgur gallery, forget it
    if domain == "imgur.com" and "gallery" in url: return None

    # Can the media be obtained directly?
    if extension in IMAGE_EXTENSIONS + VIDEO_EXTENSIONS:
        filename = f"{readable_name}_{post.id}.{extension}"
        try:
            response = requests.get(post.url)
        except:
            return None
        media_type = response.headers.get("Content-Type", "")
        if media_type.startswith("image") or media_type.startswith("video"):
            with open(os.path.join(location, "media", filename), "wb") as f:
                f.write(response.content)
            return [filename]

    # Is this a v.redd.it link?
    if domain == "redd.it":
        downloader = Downloader(max_q=True, log=False)
        downloader.url = url
        current = os.getcwd()
        try:
            name = downloader.download()
            extension = name.split(".")[-1]
            filename = f"{readable_name}_{post.id}.{extension}"
            os.rename(name, os.path.join(location, "media", filename))
            return [filename]
        except:
            # Restore the working directory in case the downloader changed it.
            os.chdir(current)
            return None

    # Is it a gfycat link that redirects? Update the URL if possible
    if domain == "gfycat.com":
        html = requests.get(post.url).content
        if len(html) < 50000:
            match = re.search(r"http([\dA-Za-z\+\:\/\.]+)\.mp4", html.decode())
            if match:
                url = match.group()
            else:
                return None

    # Is this an imgur image?
    if domain == "imgur.com" and extension != "gifv":
        for extension in IMAGE_EXTENSIONS:
            direct_url = f'https://i.{url[url.find("//") + 2:]}.{extension}'
            direct_url = direct_url.replace("i.imgur.com", "imgur.com")
            direct_url = direct_url.replace("m.imgur.com", "imgur.com")
            try:
                response = requests.get(direct_url)
            except: continue
            if response.status_code == 200:
                filename = f"{readable_name}_{post.id}.{extension}"
                with open(os.path.join(location, "media", filename), "wb") as f:
                    f.write(response.content)
                return [filename]

    # Try to use yt-dlp if it's one of the possible domains
    if domain in PLATFORMS:
        options = {
            "nocheckcertificate": True, "quiet": True, "no_warnings": True,
            "ignoreerrors": True, "no-progress": True,
            "outtmpl": os.path.join(
                location, "media", f"{readable_name}_{post.id}" + ".%(ext)s"
            )
        }
        with yt_dlp.YoutubeDL(options) as ydl:
            try:
                ydl.download([url])
            except:
                # Give up quietly if yt-dlp cannot download the link.
                return None
        for f in os.listdir(os.path.join(location, "media")):
            if f.startswith(f"{readable_name}_{post.id}"):
                return [f]


def add_media_preview_to_html(post_html, media):
    """Takes post HTML and returns a modified version with the preview
    inserted."""

    media_html_list = []
    for m in media:
        extension = m.split(".")[-1]
        location = "/".join(["media", m])
        if extension in IMAGE_EXTENSIONS:
            media_html_list.append(f'<img src="{location}">')
        if extension in VIDEO_EXTENSIONS:
            media_html_list.append(f'<video controls><source src="{location}"></video>')
    return post_html.replace('<!--preview-->', ''.join(
        [f'<div class="preview">{item}</div>' for item in media_html_list]
    ))


def create_post_page_html(post, post_html):
    """Creates the HTML for a post's own page."""

    with open(os.path.join("html", "post.html"), encoding="utf-8") as f:
        html = f.read()
    html = html.replace("<!--title-->", post.title)
    html = html.replace("<!--post-->", post_html.replace("h2>", "h1>").replace(
        '<img src="media/', '<img src="../media/'
    ).replace(
        '<source src="media/', '<source src="../media/'
    ))
    html = re.sub(r'<a href="posts(.+?)</a>', "", html)
    with open(os.path.join("html", "style.css"), encoding="utf-8") as f:
        html = html.replace("<style></style>", f"<style>\n{f.read()}\n</style>")
    with open(os.path.join("html", "main.js"), encoding="utf-8") as f:
        html = html.replace("<script></script>", f"<script>\n{f.read()}\n</script>")
    comments_html = []
    post.comments.replace_more(limit=0)
    for comment in post.comments:
        comments_html.append(get_comment_html(
            comment, op=post.author.name if post.author else None
        ))
    html = html.replace("<!--comments-->", "\n".join(comments_html))
    return html


def get_comment_html(comment, children=True, op=None):
    """Takes a comment object and creates the HTML for it - it will get its
    children too unless you specify otherwise."""

    with open(os.path.join("html", "comment-div.html"), encoding="utf-8") as f:
        html = f.read()
    dt = datetime.utcfromtimestamp(comment.created_utc)
    author = "[deleted]"
    if comment.author:
        if comment.author == op:
            author = f'<span class="op">/u/{comment.author.name}</span>'
        else:
            author = f"/u/{comment.author.name}"
    html = html.replace("<!--user-->", author)
    html = html.replace("<!--body-->", (comment.body_html or "").replace(
        '<a href="/r/', '<a href="https://reddit.com/r/'
    ))
    html = html.replace("<!--score-->", str(comment.score))
    html = html.replace("<!--link-->", f"https://reddit.com{comment.permalink}")
    html = html.replace("<!--timestamp-->", str(dt))
    html = html.replace("<!--id-->", comment.id)
    html = html.replace("<!--date-->", dt.strftime("%H:%M - %d %B, %Y"))
    if children:
        children_html = []
        for child in comment.replies:
            children_html.append(get_comment_html(child, children=False, op=op))
        html = html.replace("<!--children-->", "\n".join(children_html))
    return html


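# save_html() writes one archive page. `page` is a zero-based page number
# (None means a single, unpaginated file) and `has_next` controls whether the
# template's "Next" link is kept; the ".p.html"/".n.html" placeholders in the
# template are rewritten to point at the previous/next page files.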
def save_html(posts, comments, location, html_file, page, has_next, username=None):
    if username:
        with open(os.path.join("html", "username.html"), encoding="utf-8") as f:
            html = f.read().replace("[username]", username)
    else:
        with open(os.path.join("html", html_file), encoding="utf-8") as f:
            html = f.read()
    with open(os.path.join("html", "style.css"), encoding="utf-8") as f:
        html = html.replace("<style></style>", f"<style>\n{f.read()}\n</style>")
    with open(os.path.join("html", "main.js"), encoding="utf-8") as f:
        html = html.replace("<script></script>", f"<script>\n{f.read()}\n</script>")
    if page == 0 or page is None:
        html = html.replace("Previous</a>", "</a>")
    else:
        html = html.replace(".p.html", f".{page-1}.html")
    if not has_next or page is None:
        html = html.replace("Next</a>", "</a>")
    else:
        html = html.replace(".n.html", f".{page+1}.html")
    html = html.replace("<!--posts-->", "\n".join(posts))
    html = html.replace("<!--comments-->", "\n".join(comments))
    file_name = html_file if page is None else html_file.replace(".html", f".{page}.html")
    with open(os.path.join(location, file_name), "w", encoding="utf-8") as f:
        f.write(html)
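

# A minimal end-to-end sketch of how these utilities fit together. This block
# is illustrative only: the "archive" output directory, its "media" and
# "posts" subfolders, and the "saved.html" template name are assumptions, not
# part of the module itself.
if __name__ == "__main__":
    client = make_client()
    location = "archive"  # hypothetical output directory
    os.makedirs(os.path.join(location, "media"), exist_ok=True)
    os.makedirs(os.path.join(location, "posts"), exist_ok=True)

    posts_html = []
    for post in get_saved_posts(client):
        post_html = get_post_html(post)
        media = save_media(post, location)
        if media:
            post_html = add_media_preview_to_html(post_html, media)
        posts_html.append(post_html)
        # Each post also gets its own page alongside the index.
        with open(os.path.join(location, "posts", f"{post.id}.html"), "w", encoding="utf-8") as f:
            f.write(create_post_page_html(post, post_html))

    comments_html = [get_comment_html(c) for c in get_saved_comments(client)]
    # page=None writes a single, unpaginated file named after html_file.
    save_html(posts_html, comments_html, location, "saved.html", None, False)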