diff --git a/docs/configuration.rst b/docs/configuration.rst
index 54d1c3096a..0daae9aa10 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -2235,6 +2235,39 @@ Description
     | Leave ``SIZE`` empty to download the regular, small avatar format.
 
 
+extractor.discord.embeds
+------------------------
+Type
+    ``list`` of ``strings``
+Default
+    ``["image", "gifv", "video"]``
+Description
+    Selects which embed types to download from.
+
+    Supported embed types are
+    ``image``, ``gifv``, ``video``, ``rich``, ``article``, ``link``.
+
+
+extractor.discord.threads
+-------------------------
+Type
+    ``bool``
+Default
+    ``true``
+Description
+    Extract threads from Discord text channels.
+
+
+extractor.discord.token
+-----------------------
+Type
+    ``string``
+Description
+    Discord authentication token, sent as-is in the ``Authorization`` header of API requests.
+
+    You can follow `this guide <https://github.com/Tyrrrz/DiscordChatExporter/blob/master/.docs/Token-and-IDs.md#how-to-get-a-user-token>`__ to get a token.
+
+
 extractor.[E621].metadata
 -------------------------
 Type
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index ea1a4c1ed4..6cc00ae738 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -211,6 +211,12 @@ Consider all listed sites to potentially be NSFW.
     <td>Avatars, Backgrounds, Collections, Deviations, Favorites, Folders, Followed Users, Galleries, Gallery Searches, Journals, Scraps, Search Results, Sta.sh, Status Updates, Tag Searches, User Profiles, Watches</td>
     <td><a href="https://github.com/mikf/gallery-dl#oauth">OAuth</a></td>
 </tr>
+<tr>
+    <td>Discord</td>
+    <td>https://discord.com/</td>
+    <td>Channels, DMs, Servers</td>
+    <td></td>
+</tr>
 <tr>
     <td>Dynasty Reader</td>
     <td>https://dynasty-scans.com/</td>
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index b582c9970b..902cf719ef 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -45,6 +45,7 @@
     "danbooru",
     "desktopography",
     "deviantart",
+    "discord",
     "dynastyscans",
     "e621",
     "erome",
diff --git a/gallery_dl/extractor/discord.py b/gallery_dl/extractor/discord.py
new file mode 100644
index 0000000000..c5f78d524c
--- /dev/null
+++ b/gallery_dl/extractor/discord.py
@@ -0,0 +1,400 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://discord.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+
+
+BASE_PATTERN = r"(?:https?://)?discord\.com"
+
+
+class DiscordExtractor(Extractor):
+    """Base class for Discord extractors"""
+    category = "discord"
+    root = "https://discord.com"
+    filename_fmt = "{message_id}_{num:>02}_{filename}.{extension}"
+    archive_fmt = "{message_id}_{num}"
+
+    cdn_fmt = "https://cdn.discordapp.com/{}/{}/{}.png?size=4096"
+
+    # Class-level defaults only; _init() gives each instance its own dicts
+    server_metadata = {}
+    server_channels_metadata = {}
+
+    def _init(self):
+        """Read extractor options and set up per-instance state"""
+        self.token = self.config("token")
+        self.enabled_embeds = self.config("embeds", ["image", "gifv", "video"])
+        self.enabled_threads = self.config("threads", True)
+        # parse_channel() mutates the channel cache in place; rebinding
+        # fresh dicts here keeps instances from sharing the class-level one
+        self.server_metadata = {}
+        self.server_channels_metadata = {}
+        self.api = DiscordAPI(self)
+
+    def extract_message_text(self, message):
+        """Return message content, rich embed text and poll text"""
+        text_content = [message["content"]]
+
+        for embed in message["embeds"]:
+            if embed["type"] == "rich":
+                text_content.append(embed.get("author", {}).get("name", ""))
+                text_content.append(embed.get("title", ""))
+                text_content.append(embed.get("description", ""))
+
+                for field in embed.get("fields", []):
+                    text_content.append(field.get("name", ""))
+                    text_content.append(field.get("value", ""))
+
+                text_content.append(embed.get("footer", {}).get("text", ""))
+
+        # Must not return before this point, or poll text is never added
+        if message.get("poll"):
+            text_content.append(message["poll"]["question"]["text"])
+            for answer in message["poll"]["answers"]:
+                text_content.append(answer["poll_media"]["text"])
+
+        return "\n".join(t for t in text_content if t)
+
+    def extract_message(self, message):
+        """Yield a Directory message and one Url message per file"""
+        # https://discord.com/developers/docs/resources/message#message-object-message-types
+        if message["type"] in (0, 19, 21):
+            message_metadata = {
+                **self.server_metadata,
+                **self.server_channels_metadata[message["channel_id"]],
+                "author": message["author"]["username"],
+                "author_id": message["author"]["id"],
+                "author_files": [],
+                "message": self.extract_message_text(message),
+                "message_id": message["id"],
+                "date": text.parse_datetime(
+                    message["timestamp"], "%Y-%m-%dT%H:%M:%S.%f%z"
+                ),
+                "files": []
+            }
+
+            for icon_type, icon_path in (
+                ("avatar", "avatars"),
+                ("banner", "banners")
+            ):
+                if message["author"].get(icon_type):
+                    message_metadata["author_files"].append({
+                        "url": self.cdn_fmt.format(
+                            icon_path,
+                            message_metadata["author_id"],
+                            message["author"][icon_type]
+                        ),
+                        "filename": icon_type,
+                        "extension": "png",
+                    })
+
+            for attachment in message["attachments"]:
+                message_metadata["files"].append({
+                    "url": attachment["url"],
+                    "type": "attachment"
+                })
+
+            for embed in message["embeds"]:
+                if embed["type"] in self.enabled_embeds:
+                    # Only the first available media field is used
+                    for field in ("video", "image", "thumbnail"):
+                        url = embed.get(field, {}).get("proxy_url")
+                        if url is not None:
+                            message_metadata["files"].append({
+                                "url": url,
+                                "type": "embed"
+                            })
+                            break
+
+            for num, file in enumerate(message_metadata["files"], start=1):
+                text.nameext_from_url(file["url"], file)
+                file["num"] = num
+
+            yield Message.Directory, message_metadata
+
+            for file in message_metadata["files"]:
+                yield Message.Url, file["url"], {
+                    **message_metadata,
+                    **file
+                }
+
+    def extract_channel_text(self, channel_id):
+        """Yield results for every message of a channel"""
+        for message in self.api.get_channel_messages(channel_id):
+            yield from self.extract_message(message)
+
+    def extract_channel_threads(self, channel_id):
+        """Yield results for every thread found in a channel"""
+        for thread in self.api.get_channel_threads(channel_id):
+            thread_id = self.parse_channel(thread)["channel_id"]
+            yield from self.extract_channel_text(thread_id)
+
+    def extract_channel(self, channel_id, safe=False):
+        """Yield results for a channel according to its type
+
+        With safe=True, unsupported channel types and HTTP 403 errors
+        are skipped instead of raised.
+        """
+        try:
+            if channel_id not in self.server_channels_metadata:
+                self.parse_channel(self.api.get_channel(channel_id))
+
+            channel_type = (
+                self.server_channels_metadata[channel_id]["channel_type"]
+            )
+
+            # https://discord.com/developers/docs/resources/channel#channel-object-channel-types
+            if channel_type in (0, 5):
+                yield from self.extract_channel_text(channel_id)
+                if self.enabled_threads:
+                    yield from self.extract_channel_threads(channel_id)
+            elif channel_type in (1, 3, 10, 11, 12):
+                yield from self.extract_channel_text(channel_id)
+            elif channel_type in (15, 16):
+                yield from self.extract_channel_threads(channel_id)
+            elif channel_type in (4,):
+                for channel in self.server_channels_metadata.copy().values():
+                    if channel["parent_id"] == channel_id:
+                        yield from self.extract_channel(
+                            channel["channel_id"], safe=True
+                        )
+            elif not safe:
+                raise exception.StopExtraction(
+                    "This channel type is not supported."
+                )
+        except exception.HttpError as e:
+            if not (e.status == 403 and safe):
+                raise
+
+    def parse_channel(self, channel):
+        """Convert a channel object to metadata and cache it by ID"""
+        channel_metadata = {
+            "channel": channel.get("name", ""),
+            "channel_id": channel.get("id"),
+            "channel_type": channel.get("type"),
+            "channel_topic": channel.get("topic", ""),
+            "parent_id": channel.get("parent_id"),
+            "is_thread": "thread_metadata" in channel
+        }
+
+        if channel_metadata["parent_id"] in self.server_channels_metadata:
+            parent_metadata = (
+                self.server_channels_metadata[channel_metadata["parent_id"]]
+            )
+            channel_metadata.update({
+                "parent": parent_metadata["channel"],
+                "parent_type": parent_metadata["channel_type"]
+            })
+
+        if channel_metadata["channel_type"] in (1, 3):
+            channel_metadata.update({
+                "channel": "DMs",
+                "recipients": (
+                    [user["username"] for user in channel["recipients"]]
+                ),
+                "recipients_id": (
+                    [user["id"] for user in channel["recipients"]]
+                )
+            })
+
+        channel_id = channel_metadata["channel_id"]
+
+        self.server_channels_metadata[channel_id] = channel_metadata
+        return self.server_channels_metadata[channel_id]
+
+    def parse_server(self, server):
+        """Convert a server object to metadata and store it"""
+        self.server_metadata = {
+            "server": server["name"],
+            "server_id": server["id"],
+            "server_files": [],
+            "owner_id": server["owner_id"]
+        }
+
+        for icon_type, icon_path in (
+            ("icon", "icons"),
+            ("banner", "banners"),
+            ("splash", "splashes"),
+            ("discovery_splash", "discovery-splashes")
+        ):
+            if server.get(icon_type):
+                self.server_metadata["server_files"].append({
+                    "url": self.cdn_fmt.format(
+                        icon_path,
+                        self.server_metadata["server_id"],
+                        server[icon_type]
+                    ),
+                    "filename": icon_type,
+                    "extension": "png",
+                })
+
+        return self.server_metadata
+
+    def build_server_and_channels(self, server_id):
+        """Fetch and cache metadata for a server and its channels"""
+        server = self.api.get_server(server_id)
+        self.parse_server(server)
+
+        for channel in self.api.get_server_channels(server_id):
+            self.parse_channel(channel)
+
+
+class DiscordChannelExtractor(DiscordExtractor):
+    """Extractor for a Discord server channel or thread"""
+    subcategory = "channel"
+    directory_fmt = (
+        "{category}", "{server_id}_{server}", "{channel_id}_{channel}"
+    )
+    pattern = BASE_PATTERN + r"/channels/(\d+)/(\d+)(?:/threads/(\d+))?"
+    example = (
+        "https://discord.com/channels/302094807046684672/1306705919916249098"
+    )
+
+    def items(self):
+        server_id = self.groups[0]
+        # A thread ID in the URL takes precedence over the channel ID
+        channel_id = self.groups[2] or self.groups[1]
+
+        self.build_server_and_channels(server_id)
+
+        yield from self.extract_channel(channel_id)
+
+
+class DiscordServerExtractor(DiscordExtractor):
+    """Extractor for all supported channels of a Discord server"""
+    subcategory = "server"
+    directory_fmt = (
+        "{category}", "{server_id}_{server}", "{channel_id}_{channel}"
+    )
+    pattern = BASE_PATTERN + r"/channels/(\d+)/?$"
+    example = (
+        "https://discord.com/channels/302094807046684672"
+    )
+
+    def items(self):
+        server_id = self.groups[0]
+
+        self.build_server_and_channels(server_id)
+
+        for channel in self.server_channels_metadata.copy().values():
+            if channel["channel_type"] in (0, 5, 15, 16):
+                yield from self.extract_channel(
+                    channel["channel_id"], safe=True
+                )
+
+
+class DiscordDirectMessagesExtractor(DiscordExtractor):
+    """Extractor for a Discord direct message channel"""
+    subcategory = "direct-messages"
+    directory_fmt = (
+        "{category}", "{subcategory}", "{channel_id}_{recipients:J,}"
+    )
+    pattern = BASE_PATTERN + r"/channels/@me/(\d+)/?$"
+    example = (
+        "https://discord.com/channels/@me/302094807046684672"
+    )
+
+    def items(self):
+        channel_id = self.groups[0]
+
+        yield from self.extract_channel(channel_id)
+
+
+class DiscordAPI():
+    """Interface for the Discord API v10
+
+    https://discord.com/developers/docs/reference
+    """
+
+    def __init__(self, extractor):
+        self.extractor = extractor
+        self.token = extractor.token
+        self.root = extractor.root + "/api/v10"
+
+    def get_server(self, server_id):
+        """Get server information"""
+        return self._call("/guilds/" + server_id)
+
+    def get_server_channels(self, server_id):
+        """Get server channels"""
+        return self._call("/guilds/" + server_id + "/channels")
+
+    def get_channel(self, channel_id):
+        """Get channel information"""
+        return self._call("/channels/" + channel_id)
+
+    def get_channel_threads(self, channel_id):
+        """Get channel threads"""
+        THREADS_BATCH = 25
+
+        def _method(offset):
+            return self._call(
+                "/channels/" + channel_id + "/threads/search?"
+                "sort_by=last_message_time&sort_order=desc"
+                "&limit=" + str(THREADS_BATCH) + "&offset=" + str(offset)
+            )["threads"]
+
+        return self._pagination(_method, THREADS_BATCH)
+
+    def get_channel_messages(self, channel_id):
+        """Get channel messages"""
+        MESSAGES_BATCH = 100
+
+        before = None
+
+        def _method(_):
+            nonlocal before
+            messages = self._call(
+                "/channels/" + channel_id +
+                "/messages?limit=" + str(MESSAGES_BATCH) +
+                (("&before=" + before) if before else "")
+            )
+            # An empty page (empty channel, or the page right after a
+            # full final batch) has no last message to continue from
+            if messages:
+                before = messages[-1]["id"]
+            return messages
+
+        return self._pagination(_method, MESSAGES_BATCH)
+
+    def _call(self, endpoint):
+        """Send an authorized API request and return the decoded JSON"""
+        url = self.root + endpoint
+        try:
+            response = self.extractor.request(url, headers={
+                "Authorization": self.token,
+            })
+        except exception.HttpError as e:
+            if e.status == 401:
+                self._raise_invalid_token()
+            raise
+        return response.json()
+
+    def _pagination(self, method, batch):
+        """Yield results of method(offset) until a short page is returned"""
+        offset = 0
+        while True:
+            data = method(offset)
+            yield from data
+            if len(data) < batch:
+                return
+            offset += len(data)
+
+    @staticmethod
+    def _raise_invalid_token():
+        raise exception.AuthenticationError("""Invalid or missing token.
+Please provide a valid token following these instructions:
+
+1) Open Discord in your browser (https://discord.com/app);
+2) Open your browser's Developer Tools (F12) and switch to the Network panel;
+3) Reload the page and select any request going to https://discord.com/api/...;
+4) In the "Headers" tab, look for an entry beginning with "Authorization: ";
+5) Right-click the entry and click "Copy Value";
+6) Paste the token in your configuration file under "extractor.discord.token",
+or run this command with the -o "token=[your token]" argument.""")
diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py
index 3ef7e9e0e0..0601af8d19 100755
--- a/scripts/supportedsites.py
+++ b/scripts/supportedsites.py
@@ -43,6 +43,7 @@
     "coomerparty"    : "Coomer",
     "deltaporno"     : "DeltaPorno",
     "deviantart"     : "DeviantArt",
+    "discord"        : "Discord",
     "drawfriends"    : "Draw Friends",
     "dynastyscans"   : "Dynasty Reader",
     "e621"           : "e621",
@@ -234,6 +235,9 @@
         "status": "Status Updates",
         "watch-posts": "",
     },
+    "discord": {
+        "direct-messages": "DMs"
+    },
     "fanbox": {
         "supporting": "Supported User Feed",
         "redirect"  : "Pixiv Redirects",
diff --git a/test/results/discord.py b/test/results/discord.py
new file mode 100644
index 0000000000..053257fe4f
--- /dev/null
+++ b/test/results/discord.py
@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import discord
+# import datetime
+
+
+__tests__ = (
+{
+    "#url"     : "https://discord.com/channels/302094807046684672/1306705919916249098",
+    "#category": ("", "discord", "channel"),
+    "#class"   : discord.DiscordChannelExtractor,
+#    # access token & access to minecraft server required for this test (REMEMBER TO REMOVE TOKEN BEFORE COMMITTING)
+#    "#range"   : "1-2",
+#    "#count"   : 2,
+#    "#options" : {"token": ""},
+#
+#    "#server"       : "MINECRAFT",
+#    "#server_id"    : "302094807046684672",
+#    "#server_files" : list,
+#    "#owner_id"     : "827254075857829920",
+#    "#channel"      : str,
+#    "#channel_id"   : str,
+#    "#channel_type" : 11,
+#    "#channel_topic": str,
+#    "#parent"       : "challenges",
+#    "#parent_id"    : "1306705919916249098",
+#    "#parent_type"  : 15,
+#    "#is_thread"    : True,
+#
+#    "author"      : str,
+#    "author_id"   : str,
+#    "author_files": list,
+#    "message"     : str,
+#    "message_id"  : str,
+#    "filename"    : str,
+#    "extension"   : str,
+#    "type"        : str,
+#    "date"        : datetime.datetime,
+#    "files"       : list,
+#    "filename"    : str,
+#    "extension"   : str,
+#    "num"         : int,
+},
+
+{
+    "#url"     : "https://discord.com/channels/302094807046684672/1306705919916249098/threads/1306706528786583623",
+    "#category": ("", "discord", "channel"),
+    "#class"   : discord.DiscordChannelExtractor,
+},
+
+{
+    "#url"     : "https://discord.com/channels/302094807046684672",
+    "#category": ("", "discord", "server"),
+    "#class"   : discord.DiscordServerExtractor
+},
+
+{
+    "#url"     : "https://discord.com/channels/@me/302094807046684672",
+    "#category": ("", "discord", "direct-messages"),
+    "#class"   : discord.DiscordDirectMessagesExtractor
+}
+
+)