Files
Simple-Wishlist/include/image_fetch.php

551 lines
21 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
declare(strict_types=1);
/**
 * Robust image downloader (generic, CDN/bot-protection friendly)
 * - URL/DNS checks (SSRF guard: hosts that resolve to private IPs are rejected)
 * - optional host whitelist
 * - User-Agent rotation, realistic headers + client hints
 * - Referer: prefers the embedding page (page_url), otherwise the image origin
 * - cookie probe against page_url (collects e.g. __cf_bm)
 * - HTTP/2 -> 1.1 fallback, IPv4 preferred (optional)
 * - byte limit while streaming, retries with backoff + jitter
 * - no echo/output: response bodies are never written to the output buffer
 *
 * Result array:
 * [
 * 'ok' => bool,
 * 'tmp_path' => string|null,
 * 'mime' => string|null,
 * 'http_code' => int|null,
 * 'curl_err' => string|null,
 * 'final_url' => string|null,
 * 'bytes' => int,
 * 'error' => string|null,
 * ]
 */
namespace WList\Net;
final class ImageFetch
{
/**
 * Built-in browser User-Agent strings. download() falls back to this list when
 * the 'user_agents' option is not a non-empty array, and picks one entry per
 * attempt by attempt index.
 */
private static array $UA_LIST = [
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15',
'Mozilla/5.0 (X11; Linux x86_64; rv:129.0) Gecko/20100101 Firefox/129.0',
];
/** HTTP status codes after which download() schedules another attempt. */
private static array $RETRY_HTTP = [429, 500, 502, 503, 504, 520, 521, 522, 523, 524];
/**
 * Collect the cURL error codes that justify another download attempt.
 *
 * Every constant is looked up by name, so a cURL build that does not define
 * one of them is tolerated: the missing code is simply left out of the result.
 *
 * @return array re-indexed list of the constant values that are defined
 */
private static function retryCurlCodes(): array
{
    $constantNames = [
        'CURLE_OPERATION_TIMEDOUT',
        'CURLE_COULDNT_RESOLVE_HOST',
        'CURLE_COULDNT_CONNECT',
        'CURLE_RECV_ERROR',
        'CURLE_SEND_ERROR',
        'CURLE_GOT_NOTHING',
        'CURLE_HTTP2_STREAM',
        'CURLE_HTTP2',
    ];

    $codes = [];
    foreach ($constantNames as $constantName) {
        if (!\defined($constantName)) {
            continue;
        }
        $code = \constant($constantName);
        if ($code !== null) {
            $codes[] = $code;
        }
    }

    return $codes;
}
/**
 * Download a remote image into a temporary file.
 *
 * Flow:
 *  1. Validate $url (http/https only), apply the optional host whitelist and
 *     reject hosts that do not resolve to public addresses.
 *  2. If a valid `page_url` is given and its host is public, request it once
 *     to collect cookies that are then sent along with the image request.
 *  3. Try the image URL up to `retries` times, rotating User-Agent, HTTP
 *     version and IP-resolve preference per attempt. The body is streamed
 *     through a write callback that enforces `max_bytes`.
 *  4. Accept a response only when the transfer finished without a cURL error,
 *     the status is 2xx and the payload is an image (by Content-Type, or by
 *     getimagesize() as fallback).
 *
 * Options in $opt (defaults in parentheses):
 *  - page_url (null): embedding page; used as Referer and for the cookie probe.
 *    Values that are not valid http(s) URLs are ignored.
 *  - force_client_hints (true): add sec-ch-ua request headers.
 *  - max_bytes (8_000_000): hard limit for the streamed body.
 *  - timeout (12), connect_timeout (5): seconds, passed to cURL.
 *  - max_redirects (5): redirect limit for the image request.
 *  - retries (3): number of attempts (at least 1).
 *  - retry_backoff_ms (250): base delay, doubled per attempt, plus 0-150 ms jitter.
 *  - whitelist_hosts (null): list of allowed hosts; subdomains are accepted.
 *  - ip_resolve_v4 (true): when false, IPv4 resolution is never forced.
 *  - referer ('auto'): 'auto' = origin of $url, 'none', or 'custom'.
 *  - custom_referer (null): Referer value for referer = 'custom'.
 *  - user_agents (null): replacement User-Agent list.
 *  - log_prefix ('imgfetch'), debug (true), debug_peek_bytes (512): error_log diagnostics.
 *  - try_http_versions (['2', '1.1']), try_ip_resolve_combo (['v4', 'auto']):
 *    per-attempt rotation sequences; empty values fall back to the defaults.
 *  - failopen_hosts ([]): hosts for which a failed download is reported with
 *    error = 'failopen' and final_url = $url so the caller can keep the remote URL.
 *
 * On success the caller owns the file at `tmp_path` and must move or delete it.
 *
 * NOTE(review): redirects are followed by libcurl and the redirect targets are
 * not re-checked against the private-address filter; only the protocol is
 * pinned to http/https.
 *
 * @param string $url  image URL
 * @param array  $opt  option overrides, see above
 * @return array{ok: bool, tmp_path: ?string, mime: ?string, http_code: ?int, curl_err: ?string, final_url: ?string, bytes: int, error: ?string}
 */
public static function download(string $url, array $opt = []): array
{
    $defaults = [
        'page_url' => null, // embedding page (e.g. the offer link)
        'force_client_hints' => true, // sec-ch-ua etc.
        'max_bytes' => 8_000_000,
        'timeout' => 12,
        'connect_timeout' => 5,
        'max_redirects' => 5,
        'retries' => 3,
        'retry_backoff_ms' => 250,
        'whitelist_hosts' => null, // ['ikea.com', ...] or null
        'ip_resolve_v4' => true,
        'referer' => 'auto', // 'auto' | 'none' | 'custom'
        'custom_referer' => null,
        'user_agents' => null,
        'log_prefix' => 'imgfetch',
        'debug' => true,
        'debug_peek_bytes' => 512,
        'try_http_versions' => ['2', '1.1'],
        'try_ip_resolve_combo' => ['v4', 'auto'],
        // Optional: for hard blockers, do not fail but let the caller keep the external URL
        'failopen_hosts' => [], // e.g. ['ikea.com']
    ];
    $cfg = array_replace($defaults, $opt);

    // --- Target URL and host ---------------------------------------------
    if (!self::isValidHttpUrl($url)) {
        return self::fail(null, null, 0, 'Ungültige URL');
    }
    $host = strtolower((string) (parse_url($url, PHP_URL_HOST) ?? ''));
    if ($host === '') {
        return self::fail(null, null, 0, 'Ungültige URL (Host)');
    }

    // --- Optional whitelist (exact host or any subdomain of an entry) ------
    if (is_array($cfg['whitelist_hosts']) && $cfg['whitelist_hosts']) {
        $hostAllowed = false;
        foreach ($cfg['whitelist_hosts'] as $allowed) {
            $allowed = strtolower((string) $allowed);
            if ($host === $allowed || str_ends_with($host, '.' . $allowed)) {
                $hostAllowed = true;
                break;
            }
        }
        if (!$hostAllowed) {
            return self::fail(null, null, 0, 'Host nicht erlaubt');
        }
    }

    // --- DNS: refuse hosts that point at private addresses -----------------
    if (!self::hostResolvesPublic($host)) {
        return self::fail(null, null, 0, 'Host nicht öffentlich erreichbar');
    }

    // --- Rotation lists ----------------------------------------------------
    // Re-index so "$i % count()" always hits an existing key, and fall back to
    // the defaults when a caller passes an empty list (avoids modulo by zero).
    $uaList = (is_array($cfg['user_agents']) && $cfg['user_agents'])
        ? array_values($cfg['user_agents'])
        : self::$UA_LIST;
    $httpVersSeq = array_values((array) $cfg['try_http_versions']) ?: $defaults['try_http_versions'];
    $ipSeq = array_values((array) $cfg['try_ip_resolve_combo']) ?: $defaults['try_ip_resolve_combo'];

    $defaultReferer = match ($cfg['referer']) {
        'none' => null,
        'custom' => (string) $cfg['custom_referer'],
        default => self::originFromUrl($url),
    };

    // --- Embedding page ----------------------------------------------------
    // Only a syntactically valid http(s) URL is accepted; anything else is
    // ignored so it can neither be requested nor sent as a Referer.
    $embeddingUrl = null;
    $embeddingHost = null;
    if (is_string($cfg['page_url']) && self::isValidHttpUrl($cfg['page_url'])) {
        $embeddingUrl = $cfg['page_url'];
        $embeddingHost = strtolower((string) (parse_url($embeddingUrl, PHP_URL_HOST) ?? ''));
    }

    // Sec-Fetch-Site depends on how the embedding host relates to the image host.
    $fetchSite = 'cross-site';
    if (
        $embeddingHost && ($embeddingHost === $host
            || str_ends_with($host, '.' . $embeddingHost)
            || str_ends_with($embeddingHost, '.' . $host))
    ) {
        $fetchSite = 'same-origin';
    }

    $headers = [
        'Accept: image/avif,image/webp,image/*;q=0.8,*/*;q=0.5',
        'Accept-Language: de-DE,de;q=0.9,en;q=0.8',
        'Cache-Control: no-cache',
        'Pragma: no-cache',
        'Sec-Fetch-Dest: image',
        'Sec-Fetch-Mode: no-cors',
        'Sec-Fetch-Site: ' . $fetchSite,
    ];
    if (!empty($cfg['force_client_hints'])) {
        $headers[] = 'sec-ch-ua: "Chromium";v="128", "Not=A?Brand";v="99"';
        $headers[] = 'sec-ch-ua-mobile: ?0';
        $headers[] = 'sec-ch-ua-platform: "Linux"';
    }

    // --- Cookie helpers ----------------------------------------------------
    $cookieJar = [];
    // Stores the name=value part of every Set-Cookie header in the jar.
    $collectCookie = static function (array $respHeaders) use (&$cookieJar): void {
        if (!isset($respHeaders['set-cookie'])) {
            return;
        }
        foreach ($respHeaders['set-cookie'] as $line) {
            $parts = explode(';', $line);
            if (!$parts) {
                continue;
            }
            $nv = trim($parts[0]);
            $eq = strpos($nv, '=');
            if ($eq === false) {
                continue;
            }
            $name = substr($nv, 0, $eq);
            $value = substr($nv, $eq + 1);
            if ($name !== '' && $value !== '') {
                $cookieJar[$name] = $value;
            }
        }
    };
    // Serialises the jar into a Cookie header value, or null when it is empty.
    $cookieHeader = static function (array $jar): ?string {
        if (!$jar) {
            return null;
        }
        $buf = [];
        foreach ($jar as $k => $v) {
            $buf[] = "$k=$v";
        }
        return implode('; ', $buf);
    };

    $maxBytes = (int) $cfg['max_bytes'];
    $httpOnly = CURLPROTO_HTTP | CURLPROTO_HTTPS;

    // --- Cookie probe on the embedding page --------------------------------
    // The probe is an outgoing request to a caller-supplied URL, so it gets the
    // same public-host check as the image URL (skipped when it is the same host).
    $probeAllowed = $embeddingUrl !== null && $embeddingHost
        && ($embeddingHost === $host || self::hostResolvesPublic($embeddingHost));
    if ($probeAllowed) {
        $probeHeaders = [];
        $probeBytes = 0;
        $chp = @curl_init($embeddingUrl);
        if ($chp) {
            $probeOpts = [
                CURLOPT_NOBODY => false, // GET (HEAD often yields no Set-Cookie)
                CURLOPT_FOLLOWLOCATION => true,
                CURLOPT_MAXREDIRS => 3,
                CURLOPT_PROTOCOLS => $httpOnly,
                CURLOPT_REDIR_PROTOCOLS => $httpOnly,
                CURLOPT_CONNECTTIMEOUT => (int) $cfg['connect_timeout'],
                CURLOPT_TIMEOUT => (int) $cfg['timeout'],
                CURLOPT_USERAGENT => (string) ($uaList[0] ?? 'Mozilla/5.0'),
                CURLOPT_SSL_VERIFYPEER => true,
                CURLOPT_SSL_VERIFYHOST => 2,
                CURLOPT_HTTPHEADER => $headers,
                CURLOPT_REFERER => $embeddingUrl,
                CURLOPT_ACCEPT_ENCODING => '',
                CURLOPT_HEADERFUNCTION => static function ($ch, $hdr) use (&$probeHeaders) {
                    $line = trim($hdr);
                    if ($line !== '' && stripos($line, 'http/') !== 0) {
                        $pos = strpos($line, ':');
                        if ($pos !== false) {
                            $k = strtolower(trim(substr($line, 0, $pos)));
                            $v = trim(substr($line, $pos + 1));
                            $probeHeaders[$k][] = $v;
                        }
                    }
                    return strlen($hdr);
                },
                // The write callback is the only write handler: the body is
                // discarded chunk by chunk (never echoed, never buffered) and
                // the transfer is aborted once it exceeds max_bytes.
                CURLOPT_WRITEFUNCTION => static function ($ch, $d) use (&$probeBytes, $maxBytes) {
                    $probeBytes += strlen($d);
                    return $probeBytes > $maxBytes ? 0 : strlen($d);
                },
            ];
            @curl_setopt_array($chp, $probeOpts);
            @curl_exec($chp);
            @curl_close($chp);
            $collectCookie($probeHeaders);
        }
    }

    // --- Download attempts -------------------------------------------------
    $attempts = max(1, (int) $cfg['retries']);
    $peekLimit = max(0, (int) $cfg['debug_peek_bytes']);
    $writeErrCode = \defined('CURLE_WRITE_ERROR') ? \constant('CURLE_WRITE_ERROR') : 23;
    $lastHttp = null;
    $lastCurlErr = null;

    for ($i = 0; $i < $attempts; $i++) {
        $ua = $uaList[$i % count($uaList)];
        $httpVers = $httpVersSeq[$i % count($httpVersSeq)];
        $ipPref = $ipSeq[$i % count($ipSeq)];

        $tmp = tempnam(sys_get_temp_dir(), 'wlimg_');
        if ($tmp === false) {
            return self::fail(null, null, 0, 'Temp-Datei Fehler');
        }
        $fh = fopen($tmp, 'wb');
        if ($fh === false) {
            @unlink($tmp);
            return self::fail(null, null, 0, 'Temp-Datei Fehler');
        }

        $respHeaders = [];
        $headerFn = static function ($ch, $hdr) use (&$respHeaders) {
            $line = trim($hdr);
            if ($line !== '' && strpos($line, 'HTTP/') !== 0) {
                $pos = strpos($line, ':');
                if ($pos !== false) {
                    $k = strtolower(trim(substr($line, 0, $pos)));
                    $v = trim(substr($line, $pos + 1));
                    $respHeaders[$k][] = $v;
                }
            }
            return strlen($hdr);
        };

        $ch = curl_init($url);
        if ($ch === false) {
            fclose($fh);
            @unlink($tmp);
            return self::fail(null, null, 0, 'Download Fehler (init)');
        }

        $received = 0;
        $peekBuf = '';
        // Streams the body into the temp file, keeps the first bytes for the
        // debug log and aborts (return 0 => CURLE_WRITE_ERROR) past max_bytes.
        $writeFn = static function ($ch, $data) use (&$received, &$peekBuf, $fh, $maxBytes, $peekLimit) {
            $len = strlen($data);
            if ($peekLimit > 0 && strlen($peekBuf) < $peekLimit) {
                $peekBuf .= substr($data, 0, $peekLimit - strlen($peekBuf));
            }
            $received += $len;
            if ($received > $maxBytes) {
                return 0;
            }
            $written = fwrite($fh, $data);
            return $written === false ? 0 : $written;
        };

        // CURLOPT_RETURNTRANSFER and CURLOPT_FILE are deliberately NOT set:
        // ext/curl keeps a single write mode and the option applied last wins,
        // so either of them would replace $writeFn and disable the byte limit.
        // With a user write callback nothing is echoed anyway.
        $opts = [
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => (int) $cfg['max_redirects'],
            CURLOPT_PROTOCOLS => $httpOnly,
            CURLOPT_REDIR_PROTOCOLS => $httpOnly,
            CURLOPT_CONNECTTIMEOUT => (int) $cfg['connect_timeout'],
            CURLOPT_TIMEOUT => (int) $cfg['timeout'],
            CURLOPT_USERAGENT => $ua,
            CURLOPT_SSL_VERIFYPEER => true,
            CURLOPT_SSL_VERIFYHOST => 2,
            CURLOPT_HTTPHEADER => $headers,
            CURLOPT_HEADERFUNCTION => $headerFn,
            CURLOPT_WRITEFUNCTION => $writeFn,
            CURLOPT_ACCEPT_ENCODING => '',
        ];
        if ($httpVers === '2' && \defined('CURL_HTTP_VERSION_2_0')) {
            $opts[CURLOPT_HTTP_VERSION] = \constant('CURL_HTTP_VERSION_2_0');
        } elseif ($httpVers === '1.1' && \defined('CURL_HTTP_VERSION_1_1')) {
            $opts[CURLOPT_HTTP_VERSION] = \constant('CURL_HTTP_VERSION_1_1');
        }
        if (
            $ipPref === 'v4' && !empty($cfg['ip_resolve_v4'])
            && \defined('CURLOPT_IPRESOLVE') && \defined('CURL_IPRESOLVE_V4')
        ) {
            $opts[CURLOPT_IPRESOLVE] = \constant('CURL_IPRESOLVE_V4');
        }

        // Referer: embedding page first, configured default otherwise.
        if ($embeddingUrl) {
            $opts[CURLOPT_REFERER] = $embeddingUrl;
        } elseif ($defaultReferer) {
            $opts[CURLOPT_REFERER] = $defaultReferer;
        }

        // Cookies gathered so far (probe + earlier failed attempts).
        $cookieStr = $cookieHeader($cookieJar);
        if ($cookieStr) {
            $opts[CURLOPT_COOKIE] = $cookieStr;
        }

        curl_setopt_array($ch, $opts);
        @curl_exec($ch);
        $http = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $ctype = (string) (curl_getinfo($ch, CURLINFO_CONTENT_TYPE) ?: '');
        $finalUrl = (string) (curl_getinfo($ch, CURLINFO_EFFECTIVE_URL) ?: $url);
        $cerr = curl_errno($ch);
        $cerrStr = curl_error($ch);
        curl_close($ch);
        fclose($fh);
        $lastHttp = $http;
        $lastCurlErr = $cerrStr;

        // Debug log: one line per attempt.
        if ($cfg['debug']) {
            $hdrLog = '';
            foreach ($respHeaders as $k => $vals) {
                foreach ($vals as $v) {
                    $hdrLog .= "$k: $v | ";
                }
            }
            $hdrLog = rtrim($hdrLog, ' |');
            $peekHex = bin2hex($peekBuf);
            $peekTxt = preg_replace('/[^\x20-\x7E]/', '.', $peekBuf);
            error_log(sprintf(
                "%s try#%d http=%d cerr=%d:%s ua='%s' ref='%s' vers=%s ip=%s eff='%s' ctype='%s' bytes=%d hdrs={%s} peek_hex=%s peek_txt=%s url=%s",
                $cfg['log_prefix'],
                $i + 1,
                $http,
                $cerr,
                $cerrStr ?: '-',
                $ua,
                ($embeddingUrl ?: ($defaultReferer ?: '-')),
                $httpVers,
                $ipPref,
                $finalUrl,
                $ctype,
                $received,
                $hdrLog,
                substr($peekHex, 0, 160),
                substr($peekTxt, 0, 160),
                $url
            ));
        }

        // Size limit hit: the write callback aborted the transfer.
        if ($cerr === $writeErrCode && $received > $maxBytes) {
            @unlink($tmp);
            error_log("{$cfg['log_prefix']} size limit hit after {$received} bytes url=$url");
            return self::fail(null, null, 413, 'Bild zu groß');
        }

        // Success requires a clean transfer; a 2xx status with a cURL error
        // (timeout mid-body, partial file) would otherwise hand out a truncated image.
        if ($cerr === 0 && $http >= 200 && $http < 300) {
            $mime = null;
            // Drop Content-Type parameters such as "; charset=...".
            $declared = strtolower(trim(explode(';', $ctype, 2)[0]));
            if (str_starts_with($declared, 'image/')) {
                $mime = $declared;
            } else {
                $probe = @getimagesize($tmp);
                if ($probe !== false) {
                    $mime = $probe['mime'] ?? 'image/*';
                }
            }
            if ($mime) {
                return [
                    'ok' => true,
                    'tmp_path' => $tmp,
                    'mime' => $mime,
                    'http_code' => $http,
                    'curl_err' => null,
                    'final_url' => $finalUrl,
                    'bytes' => $received,
                    'error' => null,
                ];
            }
            // 2xx but not an image: go straight to the next attempt (next UA / HTTP version).
            @unlink($tmp);
            continue;
        }

        @unlink($tmp);
        // Keep cookies from the failed response for the next attempt.
        $collectCookie($respHeaders);
        // Cloudflare heuristic: 403 served by Cloudflare or accompanied by a cookie.
        $cf403 = ($http === 403) && (
            (isset($respHeaders['server']) && stripos(implode(',', $respHeaders['server']), 'cloudflare') !== false)
            || isset($respHeaders['set-cookie'])
        );
        $doRetry = ($i + 1) < $attempts && (
            in_array($http, self::$RETRY_HTTP, true)
            || in_array($cerr, self::retryCurlCodes(), true)
            || $http === 0
            || $cf403
        );
        if (!$doRetry) {
            break;
        }
        // Exponential backoff with jitter.
        $sleepMs = (int) ($cfg['retry_backoff_ms'] * (2 ** $i) + random_int(0, 150));
        usleep($sleepMs * 1000);
    }

    // Fail-open (e.g. a host that blocks hard): report the original URL back.
    foreach ((array) $cfg['failopen_hosts'] as $fo) {
        $fo = strtolower((string) $fo);
        if ($host === $fo || str_ends_with($host, '.' . $fo)) {
            return [
                'ok' => false,
                'tmp_path' => null,
                'mime' => null,
                'http_code' => $lastHttp ?? 403,
                'curl_err' => $lastCurlErr,
                'final_url' => $url,
                'bytes' => 0,
                'error' => 'failopen',
            ];
        }
    }

    return self::fail(null, $lastCurlErr, $lastHttp ?? 0, 'Bild-Download fehlgeschlagen');
}
/**
 * Build a random local file name for a downloaded image.
 *
 * The name is 20 random hex characters plus an extension. The extension is
 * taken from the URL *path* (query string and fragment are ignored) and is
 * only kept when it is a known image extension; anything else, including
 * server-executable extensions such as "php", is replaced by "jpg". The URL is
 * attacker-controlled, so the extension must never be trusted as-is.
 *
 * NOTE(review): "svg" is kept for compatibility; SVG files can carry script,
 * so how they are served is the caller's responsibility.
 *
 * @param string $url source URL of the image
 * @return string file name of the form "<20 hex chars>.<ext>"
 */
public static function safeFileNameFromUrl(string $url): string
{
    $imageExtensions = [
        'jpg', 'jpeg', 'png', 'gif', 'webp', 'avif', 'bmp',
        'ico', 'svg', 'tif', 'tiff', 'heic', 'heif',
    ];

    // parse_url() isolates the path, so a bare host ("example.com") or a query
    // value ("?file=x.php") can no longer be mistaken for a file extension.
    $path = parse_url($url, PHP_URL_PATH);
    $ext = is_string($path) ? strtolower(pathinfo($path, PATHINFO_EXTENSION)) : '';
    if (!in_array($ext, $imageExtensions, true)) {
        $ext = 'jpg';
    }

    return bin2hex(random_bytes(10)) . '.' . $ext;
}
/**
 * Tell whether $url is a well-formed absolute URL with an http or https scheme.
 *
 * The URL has to pass FILTER_VALIDATE_URL and parse into a non-empty scheme
 * and host; the scheme comparison is case-insensitive.
 *
 * @param string $url candidate URL
 * @return bool true for http/https URLs, false for everything else
 */
private static function isValidHttpUrl(string $url): bool
{
    if (!filter_var($url, FILTER_VALIDATE_URL)) {
        return false;
    }

    $parts = parse_url($url);
    $hasSchemeAndHost = is_array($parts) && !empty($parts['scheme']) && !empty($parts['host']);
    if (!$hasSchemeAndHost) {
        return false;
    }

    return in_array(strtolower($parts['scheme']), ['http', 'https'], true);
}
/**
 * Reduce a URL to its origin with a trailing slash ("scheme://host[:port]/").
 *
 * The port is only included when the URL names one and it is not the default
 * for the scheme (443 for "https", 80 for anything else).
 *
 * @param string $url absolute URL
 * @return string the origin, or '' when scheme or host cannot be determined
 */
private static function originFromUrl(string $url): string
{
    $parts = parse_url($url);
    if (!$parts || empty($parts['scheme']) || empty($parts['host'])) {
        return '';
    }

    $scheme = $parts['scheme'];
    $origin = $scheme . '://' . $parts['host'];

    if (!empty($parts['port'])) {
        $defaultPort = ($scheme === 'https') ? 443 : 80;
        if ((int) $parts['port'] !== $defaultPort) {
            $origin .= ':' . $parts['port'];
        }
    }

    return $origin . '/';
}
/**
 * Check that a URL host points only at public addresses.
 *
 * - An IP literal (IPv4, or IPv6 with or without the surrounding brackets that
 *   parse_url() keeps) is checked directly, without a DNS lookup.
 * - A host name is resolved via its A and AAAA records; a single private
 *   address among them rejects the host.
 * - A host for which no address could be checked at all is rejected.
 *
 * NOTE(review): this is a pre-flight check. cURL resolves the name again for
 * the actual request, so a changing DNS answer is not covered here.
 *
 * @param string $host host part of a URL
 * @return bool true when at least one address was found and none is private
 */
private static function hostResolvesPublic(string $host): bool
{
    // IP literal: there are no DNS records to consult, decide on the address itself.
    $literal = $host;
    if (strlen($host) > 2 && str_starts_with($host, '[') && str_ends_with($host, ']')) {
        $literal = substr($host, 1, -1);
    }
    if ($literal !== '' && filter_var($literal, FILTER_VALIDATE_IP) !== false) {
        return !self::isPrivateIp($literal);
    }

    // dns_get_record() emits a warning on lookup failure; a failed lookup is
    // simply treated as "not public".
    $records = @dns_get_record($host, DNS_A | DNS_AAAA);
    if (!is_array($records) || $records === []) {
        return false;
    }

    $checked = 0;
    foreach ($records as $record) {
        $ip = match ($record['type'] ?? null) {
            'A' => $record['ip'] ?? null,
            'AAAA' => $record['ipv6'] ?? null,
            default => null,
        };
        if (!$ip) {
            continue;
        }
        if (self::isPrivateIp($ip)) {
            return false;
        }
        $checked++;
    }

    // Records without any usable address must not count as "public".
    return $checked > 0;
}
/**
 * Tell whether an IP address must not be contacted by the downloader.
 *
 * "Private" is used in the broad sense: loopback, RFC 1918 / unique-local,
 * link-local, carrier-grade NAT, "this network", multicast, reserved and
 * documentation ranges are all refused. IPv4-mapped IPv6 addresses
 * (::ffff:a.b.c.d) are unwrapped and judged by the embedded IPv4 address, so
 * they cannot be used to smuggle e.g. 127.0.0.1 past the filter.
 *
 * @param string $ip IPv4 or IPv6 address in text form
 * @return bool true when the address is non-public or not a valid IP at all
 */
private static function isPrivateIp(string $ip): bool
{
    if (filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4)) {
        $cidrs = [
            '0.0.0.0/8',        // "this network"; 0.0.0.0 reaches the local host on many systems
            '10.0.0.0/8',       // RFC 1918
            '100.64.0.0/10',    // carrier-grade NAT
            '127.0.0.0/8',      // loopback
            '169.254.0.0/16',   // link-local (incl. cloud metadata endpoints)
            '172.16.0.0/12',    // RFC 1918
            '192.0.0.0/24',     // IETF protocol assignments
            '192.0.2.0/24',     // documentation
            '192.168.0.0/16',   // RFC 1918
            '198.18.0.0/15',    // benchmarking
            '198.51.100.0/24',  // documentation
            '203.0.113.0/24',   // documentation
            '224.0.0.0/4',      // multicast
            '240.0.0.0/4',      // reserved, incl. limited broadcast
        ];
    } elseif (filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV6)) {
        // IPv4-mapped address: decide on the embedded IPv4 address.
        $packed = inet_pton($ip);
        if ($packed !== false && strlen($packed) === 16
            && substr($packed, 0, 12) === "\0\0\0\0\0\0\0\0\0\0\xff\xff") {
            $embedded = inet_ntop(substr($packed, 12));
            return $embedded === false ? true : self::isPrivateIp($embedded);
        }
        $cidrs = [
            '::/96',            // unspecified, loopback (::1) and IPv4-compatible addresses
            '64:ff9b::/96',     // NAT64, translates to arbitrary IPv4 targets
            '100::/64',         // discard-only
            '2001:db8::/32',    // documentation
            'fc00::/7',         // unique local
            'fe80::/10',        // link-local
            'ff00::/8',         // multicast
        ];
    } else {
        // Not an IP address at all: refuse.
        return true;
    }

    foreach ($cidrs as $cidr) {
        if (self::ipInCidr($ip, $cidr)) {
            return true;
        }
    }

    return false;
}
/**
 * Test whether an IP address lies inside a CIDR block.
 *
 * IPv4 and IPv6 share one code path: both sides are converted to their packed
 * binary form and compared prefix-wise. The function returns false instead of
 * failing when
 *  - either side is not a valid IP address,
 *  - address and subnet belong to different families,
 *  - the prefix length is not a number or exceeds the family's bit width.
 * A CIDR string without "/prefix" is treated as a single address (/32 or /128).
 *
 * @param string $ip   IPv4 or IPv6 address in text form
 * @param string $cidr block in "address/prefix" notation
 * @return bool true when $ip falls into $cidr
 */
private static function ipInCidr(string $ip, string $cidr): bool
{
    [$subnet, $prefix] = array_pad(explode('/', $cidr, 2), 2, null);

    // Validate first so inet_pton() never sees (and warns about) bad input.
    if (filter_var($ip, FILTER_VALIDATE_IP) === false
        || filter_var((string) $subnet, FILTER_VALIDATE_IP) === false) {
        return false;
    }
    $binIp = inet_pton($ip);
    $binSubnet = inet_pton((string) $subnet);
    if ($binIp === false || $binSubnet === false) {
        return false;
    }

    // 4 packed bytes = IPv4, 16 = IPv6; different lengths mean different families.
    $byteLen = strlen($binIp);
    if ($byteLen !== strlen($binSubnet)) {
        return false;
    }

    $maxBits = $byteLen * 8;
    if ($prefix === null) {
        $bits = $maxBits;
    } elseif (preg_match('/^\d{1,3}$/', $prefix) === 1 && (int) $prefix <= $maxBits) {
        $bits = (int) $prefix;
    } else {
        return false;
    }

    // Whole bytes covered by the prefix must be identical ...
    $fullBytes = intdiv($bits, 8);
    if ($fullBytes > 0 && substr($binIp, 0, $fullBytes) !== substr($binSubnet, 0, $fullBytes)) {
        return false;
    }

    // ... and so must the leading bits of the following, partially covered byte.
    $remainingBits = $bits % 8;
    if ($remainingBits === 0) {
        return true;
    }
    $mask = (0xFF << (8 - $remainingBits)) & 0xFF;

    return (ord($binIp[$fullBytes]) & $mask) === (ord($binSubnet[$fullBytes]) & $mask);
}
/**
 * Build the standard failure result and clean up a leftover temp file.
 *
 * @param string|null $tmp  path of a temp file to delete if it exists, or null
 * @param string|null $cerr cURL error text to report, if any
 * @param int         $http HTTP status; values <= 0 are reported as null
 * @param string      $msg  error message for the 'error' key
 * @return array result array with 'ok' => false, no file, no mime and 0 bytes
 */
private static function fail(?string $tmp, ?string $cerr, int $http, string $msg): array
{
    $leftoverExists = $tmp && is_file($tmp);
    if ($leftoverExists) {
        @unlink($tmp);
    }

    $httpCode = null;
    if ($http > 0) {
        $httpCode = $http;
    }

    $result = [
        'ok' => false,
        'tmp_path' => null,
        'mime' => null,
        'http_code' => $httpCode,
        'curl_err' => $cerr,
        'final_url' => null,
        'bytes' => 0,
        'error' => $msg,
    ];

    return $result;
}
}