reddit-save/utilities.py

240 lines
8.7 KiB
Python
Raw Normal View History

2020-12-30 23:59:55 +01:00
import os
import praw
2020-12-31 01:33:30 +01:00
import requests
2021-01-01 22:44:40 +01:00
from redvid import Downloader
2020-12-31 02:32:06 +01:00
import youtube_dl
2020-12-31 03:18:48 +01:00
import re
2020-12-30 23:59:55 +01:00
from datetime import datetime
try:
from logindata import REDDIT_USERNAME, REDDIT_PASSWORD
from logindata import REDDIT_CLIENT_ID, REDDIT_SECRET
except ImportError:
REDDIT_USERNAME = os.getenv("REDDIT_USERNAME")
REDDIT_PASSWORD = os.getenv("REDDIT_PASSWORD")
REDDIT_CLIENT_ID = os.getenv("REDDIT_CLIENT_ID")
REDDIT_SECRET = os.getenv("REDDIT_SECRET")
2020-12-30 23:59:55 +01:00
2021-01-02 23:33:47 +01:00
# File extensions that save_media downloads directly and that
# add_media_preview_to_html renders with an <img> element.
IMAGE_EXTENSIONS = [
    "gif",
    "gifv",
    "jpg",
    "jpeg",
    "png",
]

# File extensions that are rendered with a <video> element.
VIDEO_EXTENSIONS = [
    "mp4",
]

# Domains (last two labels of the host name) that save_media hands to
# youtube_dl when no direct download worked.
PLATFORMS = [
    "redgifs.com",
    "gfycat.com",
    "imgur.com",
    "youtube.com",
]
2020-12-31 01:33:30 +01:00
2020-12-30 23:59:55 +01:00
def make_client():
    """Builds a PRAW client from the module-level Reddit credentials (the
    REDDIT_* values imported from logindata or read from the environment)."""

    credentials = {
        "username": REDDIT_USERNAME,
        "password": REDDIT_PASSWORD,
        "client_id": REDDIT_CLIENT_ID,
        "client_secret": REDDIT_SECRET,
    }
    return praw.Reddit(user_agent="reddit-save", **credentials)
def get_saved_posts(client):
    """Returns a list of the saved items of the logged-in user that are
    submissions (anything whose class is not named "Submission" is skipped)."""

    listing = client.user.me().saved(limit=None)
    return [item for item in listing if item.__class__.__name__ == "Submission"]
2020-12-30 23:59:55 +01:00
2020-12-31 00:05:45 +01:00
def get_upvoted_posts(client):
    """Returns a list of the upvoted items of the logged-in user that are
    submissions (anything whose class is not named "Submission" is skipped)."""

    listing = client.user.me().upvoted(limit=None)
    return [item for item in listing if item.__class__.__name__ == "Submission"]
2020-12-31 00:05:45 +01:00
2021-01-03 01:20:15 +01:00
def get_saved_comments(client):
    """Returns a list of the saved items of the logged-in user that are not
    submissions (everything whose class is not named "Submission")."""

    listing = client.user.me().saved(limit=None)
    return [item for item in listing if item.__class__.__name__ != "Submission"]
2020-12-30 23:59:55 +01:00
def get_post_html(post):
    """Takes a post object and creates a HTML for it - but not including the
    preview HTML.

    The template html/post-div.html (relative to the working directory) is
    read and its placeholder comments are filled in from the post. The title
    and the content link are treated as untrusted plain text and are
    HTML-escaped before insertion. The body (post.selftext_html) is inserted
    as HTML, with its relative /r/ links rewritten to absolute reddit.com
    links. Any <!--preview--> placeholder is left untouched.

    Returns the filled-in HTML as a string."""

    # Imported locally so this function does not rely on additions to the
    # module's import list.
    from datetime import timezone
    from html import escape

    with open(os.path.join("html", "post-div.html"), encoding="utf-8") as f:
        markup = f.read()

    # Naive UTC datetime: str() of it carries no offset, which is the same
    # text datetime.utcfromtimestamp (deprecated since Python 3.12) produced.
    created = datetime.fromtimestamp(
        post.created_utc, tz=timezone.utc
    ).replace(tzinfo=None)
    author = f"/u/{post.author.name}" if post.author else "[deleted]"
    body = (post.selftext_html or "").replace(
        '<a href="/r/', '<a href="https://reddit.com/r/'
    )

    # Applied strictly in this order, one placeholder at a time.
    substitutions = [
        ("<!--title-->", escape(post.title)),
        ("<!--subreddit-->", f"/r/{str(post.subreddit)}"),
        ("<!--user-->", author),
        ("<!--link-->", f"posts/{post.id}.html"),
        ("<!--reddit-link-->", f"https://reddit.com{post.permalink}"),
        ("<!--content-link-->", escape(post.url)),
        ("<!--id-->", post.id),
        ("<!--body-->", body),
        ("<!--timestamp-->", str(created)),
        ("<!--date-->", created.strftime("%d %B, %Y")),
    ]
    for placeholder, value in substitutions:
        markup = markup.replace(placeholder, value)
    return markup
2020-12-31 02:32:06 +01:00
def save_media(post, location):
    """Takes a post object and tries to download any image/video it might be
    associated with. If it can, it will return the filename.

    Files are written to the "media" directory inside location and are named
    "<last permalink segment>_<post id>.<extension>". The strategies are
    tried in order: direct download by file extension, v.redd.it via redvid,
    imgur direct links, then youtube_dl for the domains in PLATFORMS.

    Returns None when the post URL ends with its own permalink, when it is
    an imgur gallery, or when nothing could be downloaded."""

    url = post.url
    stripped_url = url.split("?")[0]
    # A post whose URL ends with its own permalink links to itself.
    if url.endswith(post.permalink):
        return None

    # What is the key information?
    extension = stripped_url.split(".")[-1].lower()
    domain = ".".join(url.split("/")[2].split(".")[-2:])
    readable_name = list(filter(bool, post.permalink.split("/")))[-1]
    base_name = f"{readable_name}_{post.id}"
    media_dir = os.path.join(location, "media")

    # If it's an imgur gallery, forget it
    if domain == "imgur.com" and "gallery" in url:
        return None

    # Can the media be obtained directly?
    if extension in IMAGE_EXTENSIONS + VIDEO_EXTENSIONS:
        response = requests.get(post.url)
        media_type = response.headers.get("Content-Type", "")
        # Only trust the extension if the server really sent media back.
        if media_type.startswith(("image", "video")):
            filename = f"{base_name}.{extension}"
            with open(os.path.join(media_dir, filename), "wb") as f:
                f.write(response.content)
            return filename

    # Is this a v.redd.it link?
    if domain == "redd.it":
        downloader = Downloader(max_q=True, log=False)
        downloader.url = url
        current = os.getcwd()
        try:
            name = downloader.download()
            video_extension = name.split(".")[-1]
            filename = f"{base_name}.{video_extension}"
            os.rename(name, os.path.join(media_dir, filename))
            return filename
        except Exception:
            # Best effort: any failure while downloading or moving the file
            # means there is no media for this post. KeyboardInterrupt and
            # SystemExit are deliberately not swallowed.
            return None
        finally:
            # The working directory is put back on every path; presumably
            # the downloader can leave it changed - confirm against redvid.
            os.chdir(current)

    # Is it a gfycat link that redirects? Update the URL if possible
    if domain == "gfycat.com":
        page = requests.get(post.url).content
        if len(page) < 50000:
            match = re.search(r"http([\dA-Za-z\+\:\/\.]+)\.mp4", page.decode())
            if match:
                url = match.group()
            else:
                return None

    # Is this an imgur image?
    if domain == "imgur.com" and extension != "gifv":
        # Probe each image extension until one direct link answers 200.
        for image_extension in IMAGE_EXTENSIONS:
            direct_url = f'https://i.{url[url.find("//") + 2:]}.{image_extension}'
            direct_url = direct_url.replace("i.imgur.com", "imgur.com")
            direct_url = direct_url.replace("m.imgur.com", "imgur.com")
            response = requests.get(direct_url)
            if response.status_code == 200:
                filename = f"{base_name}.{image_extension}"
                with open(os.path.join(media_dir, filename), "wb") as f:
                    f.write(response.content)
                return filename

    # Try to use youtube_dl if it's one of the possible domains
    if domain in PLATFORMS:
        options = {
            "nocheckcertificate": True,
            "quiet": True,
            "no_warnings": True,
            "ignoreerrors": True,
            "outtmpl": os.path.join(media_dir, base_name + ".%(ext)s"),
        }
        with youtube_dl.YoutubeDL(options) as ydl:
            try:
                ydl.download([url])
            except Exception:
                # Best effort: a failed download just means no file turns up
                # in the directory scan below.
                pass
        # youtube_dl picks the extension, so look for whatever it wrote.
        for existing in os.listdir(media_dir):
            if existing.startswith(base_name):
                return existing

    return None
2020-12-31 01:33:30 +01:00
def add_media_preview_to_html(post_html, media):
    """Returns post_html with its <!--preview--> placeholder swapped for an
    <img> or <video> element pointing at media/<media>, chosen by the file
    extension. If the extension is in neither IMAGE_EXTENSIONS nor
    VIDEO_EXTENSIONS the HTML is returned unchanged."""

    suffix = media.rsplit(".", 1)[-1]
    source = f"media/{media}"
    if suffix in IMAGE_EXTENSIONS:
        preview = f'<img src="{source}">'
    elif suffix in VIDEO_EXTENSIONS:
        preview = f'<video controls><source src="{source}"></video>'
    else:
        return post_html
    return post_html.replace("<!--preview-->", preview)
def create_post_page_html(post, post_html):
    """Creates the HTML for a post's own page.

    The template html/post.html (relative to the working directory) is
    filled with the post title (HTML-escaped, as it is treated as untrusted
    plain text), the given post_html, the contents of html/style.css and
    html/main.js inlined into the empty <style> and <script> elements, and
    the HTML of every top-level comment of the post.

    Returns the finished page as a string."""

    # Imported locally so this function does not rely on additions to the
    # module's import list.
    from html import escape

    with open(os.path.join("html", "post.html"), encoding="utf-8") as f:
        page = f.read()
    page = page.replace("<!--title-->", escape(post.title))

    # On the post's own page h2 tags become h1 and media paths are pointed
    # one directory up.
    embedded = (
        post_html.replace("h2>", "h1>")
        .replace('<img src="media/', '<img src="../media/')
        .replace('<source src="media/', '<source src="../media/')
    )
    page = page.replace("<!--post-->", embedded)

    # Strip anchors whose href starts with "posts".
    page = re.sub(r'<a href="posts(.+?)</a>', "", page)

    with open(os.path.join("html", "style.css"), encoding="utf-8") as f:
        page = page.replace("<style></style>", f"<style>\n{f.read()}\n</style>")
    with open(os.path.join("html", "main.js"), encoding="utf-8") as f:
        page = page.replace("<script></script>", f"<script>\n{f.read()}\n</script>")

    post.comments.replace_more(limit=0)
    # The original poster's name lets get_comment_html mark their comments.
    op = post.author.name if post.author else None
    comments_html = [
        get_comment_html(comment, op=op) for comment in post.comments
    ]
    page = page.replace("<!--comments-->", "\n".join(comments_html))
    return page
2021-01-03 23:08:22 +01:00
def get_comment_html(comment, children=True, op=None):
    """Takes a comment object and creates a HTML for it from the template
    html/comment-div.html. Unless children is False, the comment's direct
    replies are rendered too (without their own replies) and placed in the
    <!--children--> placeholder. A comment whose author equals op gets its
    username wrapped in a span with class "op"."""

    with open(os.path.join("html", "comment-div.html"), encoding="utf-8") as f:
        markup = f.read()
    created = datetime.utcfromtimestamp(comment.created_utc)

    if not comment.author:
        author = "[deleted]"
    elif comment.author == op:
        author = f'<span class="op">/u/{comment.author.name}</span>'
    else:
        author = f"/u/{comment.author.name}"

    body = (comment.body_html or "").replace(
        '<a href="/r/', '<a href="https://reddit.com/r/'
    )

    # Applied strictly in this order, one placeholder at a time.
    substitutions = (
        ("<!--user-->", author),
        ("<!--body-->", body),
        ("<!--score-->", str(comment.score)),
        ("<!--link-->", f"https://reddit.com{comment.permalink}"),
        ("<!--timestamp-->", str(created)),
        ("<!--id-->", comment.id),
        ("<!--date-->", created.strftime("%H:%M - %d %B, %Y")),
    )
    for placeholder, value in substitutions:
        markup = markup.replace(placeholder, value)

    if children:
        replies_html = "\n".join(
            get_comment_html(reply, children=False, op=op)
            for reply in comment.replies
        )
        markup = markup.replace("<!--children-->", replies_html)
    return markup