Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Threads] add bridge #3805

Merged
merged 3 commits into from
Nov 21, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 120 additions & 0 deletions bridges/ThreadsBridge.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
<?php

/**
 * Bridge that turns a public Threads profile page into a feed.
 *
 * The profile page is fetched once; post short-codes are extracted from the
 * JSON documents embedded in its <script type="application/json"> tags, and
 * every post page is then fetched individually so that its Open Graph meta
 * tags can be used to build the feed item.
 */
class ThreadsBridge extends BridgeAbstract
{
    const NAME = 'Threads';
    const URI = 'https://www.threads.net/';
    const DESCRIPTION = 'Say more with Threads &#x2014; Instagram&#039;s new text app.';
    const MAINTAINER = 'mdemoss';
    const CACHE_TIMEOUT = 3600;

    const PARAMETERS = [
        'By username' => [
            'u' => [
                'name' => 'username',
                'required' => true,
                'exampleValue' => 'zuck',
                'title' => 'Insert a user name'
            ],
            'limit' => [
                'name' => 'Limit',
                'type' => 'number',
                'required' => false,
                'title' => 'Specify number of posts to fetch',
                'defaultValue' => 5
            ]
        ]
    ];

    /** @var string Feed title; replaced by the profile's og:title once data has been collected. */
    protected $feedName = self::NAME;

    /**
     * @return string The bridge name, or the profile title after collectData() has run.
     */
    public function getName()
    {
        return $this->feedName;
    }

    /**
     * Derive bridge parameters from a threads.net profile or post URL.
     *
     * @param string $url URL to inspect.
     * @return array|null Parameters for the "By username" context, or null when the URL does not match.
     */
    public function detectParameters($url)
    {
        // Groups: 1 = scheme, 2 = "www.", 3 = optional "@", 4 = user name.
        $regex = '/^(https?:\/\/)?(www\.)?threads\.net\/(@)?([^\/?\n]+)/';
        if (preg_match($regex, $url, $matches) > 0) {
            return [
                'context' => 'By username',
                // Group 4 holds the user name; group 3 is only the optional "@" prefix.
                'u' => urldecode($matches[4]),
            ];
        }
        return null;
    }

    /**
     * @return string The profile URL for the configured user, or the site URL when no user is set.
     */
    public function getURI()
    {
        // Accept both "zuck" and "@zuck" so the URL never ends up containing "@@".
        $username = ltrim(trim((string) $this->getInput('u')), '@');
        if ($username === '') {
            return self::URI;
        }
        return self::URI . '@' . $username;
    }

    /**
     * Collect every value stored under a given key, at any depth.
     *
     * Adapted from https://stackoverflow.com/a/3975706/421140 (found in FlaschenpostBridge),
     * modified to return all matches instead of the first one.
     *
     * @param array|object $haystack Nested structure to search; anything else yields no matches.
     * @param string $needle Key to look for.
     * @return array List of the values found under $needle, in traversal order.
     */
    private function recursiveFind($haystack, $needle)
    {
        $found = [];
        if (!is_array($haystack) && !is_object($haystack)) {
            // RecursiveArrayIterator rejects scalars and null.
            return $found;
        }
        $recursive = new \RecursiveIteratorIterator(
            new \RecursiveArrayIterator($haystack),
            // SELF_FIRST visits container nodes as well as leaves, so keys holding arrays are matched too.
            \RecursiveIteratorIterator::SELF_FIRST
        );
        foreach ($recursive as $key => $value) {
            if ($key === $needle) {
                $found[] = $value;
            }
        }
        return $found;
    }

    /**
     * Read the "limit" input, falling back to the declared default when it is missing or not positive.
     *
     * @return int Maximum number of posts to fetch (always >= 1).
     */
    private function resolveLimit()
    {
        $limit = (int) $this->getInput('limit');
        if ($limit < 1) {
            $limit = self::PARAMETERS['By username']['limit']['defaultValue'];
        }
        return $limit;
    }

    /**
     * Decode HTML entities consistently across PHP versions.
     *
     * The default flags of html_entity_decode() only cover single quotes
     * (e.g. "&#039;") since PHP 8.1, so the flags are passed explicitly.
     *
     * @param string $text Text that may contain HTML entities.
     * @return string Decoded text.
     */
    private function decodeEntities($text)
    {
        return html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');
    }

    /**
     * Fetch the raw "content" attribute of an Open Graph meta tag.
     *
     * @param mixed $dom Parsed document (anything falsy is treated as "no document").
     * @param string $property Value of the meta tag's "property" attribute, e.g. "og:title".
     * @return string|null The attribute value, or null when the tag or attribute is absent.
     */
    private function metaContent($dom, $property)
    {
        if (!$dom) {
            return null;
        }
        $node = $dom->find('meta[property=' . $property . ']', 0);
        if (!$node) {
            return null;
        }
        $content = $node->content;
        return is_string($content) ? $content : null;
    }

    /**
     * Extract unique post short-codes from the JSON blobs embedded in a profile page.
     *
     * @param mixed $html Parsed profile page.
     * @param int $limit Maximum number of codes to return.
     * @return string[] Post codes in document order, without duplicates.
     */
    private function gatherPostCodes($html, $limit)
    {
        $codes = [];
        $jsonBlobs = $html->find('script[type="application/json"]');
        Debug::log(sprintf('%d JSON blobs found.', count($jsonBlobs)));

        foreach ($jsonBlobs as $jsonBlob) {
            $decoded = json_decode($jsonBlob->innertext, true);
            if (!is_array($decoded)) {
                // Invalid JSON or a bare scalar: nothing to search.
                continue;
            }
            // The structure of the JSON document is likely to change, but we're looking for a "code" inside a "post"
            $posts = $this->recursiveFind($decoded, 'post');
            foreach ($this->recursiveFind($posts, 'code') as $candidateCode) {
                // code should be like CzZk4-USq1O or Cy3m1VnRiwP or Cywjyrdv9T6
                // Codes are ASCII, so a byte length check is enough and avoids needing the intl extension.
                if (!is_string($candidateCode) || strlen($candidateCode) !== 11) {
                    continue;
                }
                if (in_array($candidateCode, $codes, true)) {
                    continue;
                }
                $codes[] = $candidateCode;
                if (count($codes) >= $limit) {
                    return $codes;
                }
            }
        }
        return $codes;
    }

    /**
     * Fetch the profile page, discover recent posts and append one feed item per post to $this->items.
     *
     * @throws \Exception When the profile page could not be parsed.
     */
    public function collectData()
    {
        $profileUri = $this->getURI();
        $html = getSimpleHTMLDOMCached($profileUri, static::CACHE_TIMEOUT);
        Debug::log(sprintf('Fetched: %s', $profileUri));
        if (!$html) {
            throw new \Exception(sprintf('Unable to parse the Threads profile page: %s', $profileUri));
        }

        $gatheredCodes = $this->gatherPostCodes($html, $this->resolveLimit());
        Debug::log(sprintf('Candidate codes found in JSON in script tags: %s', print_r($gatheredCodes, true)));

        // Keep the default feed name when the profile exposes no og:title.
        $profileTitle = $this->metaContent($html, 'og:title');
        if ($profileTitle !== null) {
            $this->feedName = $this->decodeEntities($profileTitle);
        }
        // todo: meta[property=og:description] could populate the feed description

        foreach ($gatheredCodes as $postCode) {
            $item = [];
            // post URL is like: https://www.threads.net/@zuck/post/Czrr520PZfh
            $item['uri'] = $profileUri . '/post/' . $postCode;
            $articleHtml = getSimpleHTMLDOMCached($item['uri'], 15778800); // cache time: six months

            // Relying on meta tags ought to be more reliable.
            if ($this->metaContent($articleHtml, 'og:type') !== 'article') {
                continue;
            }

            $author = $this->metaContent($articleHtml, 'og:title');
            $item['author'] = $author === null ? '' : $this->decodeEntities($author);

            $description = $this->metaContent($articleHtml, 'og:description');
            if ($description === null) {
                // Post without a description: fall back to the author line so the item still has a title.
                $item['title'] = $item['author'];
                $item['content'] = '';
            } else {
                // The title is plain text, so entities are decoded; the content keeps the escaped attribute value.
                $item['title'] = $this->decodeEntities($description);
                $item['content'] = $description;
            }

            $imageUrl = $this->metaContent($articleHtml, 'og:image');
            if ($imageUrl !== null && $imageUrl !== '') {
                $item['enclosures'] = [$this->decodeEntities($imageUrl)];
            }

            // todo: parse hashtags out of content for $item['categories']
            // todo: try to scrape out a timestamp for $item['timestamp'], it's not in the meta tags

            $this->items[] = $item;
        }
    }
}