reworked image-fetching

This commit is contained in:
2025-09-15 08:13:23 +02:00
parent 63898b6391
commit d38123ce67
2 changed files with 391 additions and 147 deletions

357
include/image_fetch.php Normal file
View File

@@ -0,0 +1,357 @@
<?php
declare(strict_types=1);
/**
* Robust Image Downloader (CDN/Anti-Bot freundlich)
* - Validiert URL + DNS (keine private IPs/SSRF)
* - Host-Whitelist optional
* - Rotiert mehrere echte Browser-UAs
* - Setzt realistische Headers (Accept: AVIF/WebP etc.)
* - Referer automatisch auf Origin
* - IPv4 bevorzugt
* - Follow Redirects
* - Strict Content-Type-Check (image/*)
* - Byte-Limit-Abbruch im Stream
* - Retries mit Exponential Backoff + Jitter
* - Liefert Temp-Datei; Caller verschiebt/benennt final
*
* Rückgabe-Array:
* [
* 'ok' => bool,
* 'tmp_path' => string|null,
* 'mime' => string|null,
* 'http_code' => int|null,
* 'curl_err' => string|null,
* 'final_url' => string|null,
* 'bytes' => int,
* 'error' => string, // only present when ok=false: short error message
* ]
*/
namespace WList\Net;
final class ImageFetch
{
    /** Default user agents; one is picked per attempt (round-robin). */
    private static array $UA_LIST = [
        // Current desktop browser strings so CDNs treat the request like a regular browser.
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15',
        'Mozilla/5.0 (X11; Linux x86_64; rv:129.0) Gecko/20100101 Firefox/129.0',
    ];

    /** HTTP status codes for which another attempt is worthwhile. */
    private static array $RETRY_HTTP = [429, 500, 502, 503, 504, 520, 521, 522, 523, 524];

    /** cURL error codes for which another attempt is worthwhile. */
    private static array $RETRY_CURL = [
        CURLE_OPERATION_TIMEDOUT,
        CURLE_COULDNT_RESOLVE_HOST,
        CURLE_COULDNT_CONNECT,
        CURLE_RECV_ERROR,
        CURLE_SEND_ERROR,
        CURLE_GOT_NOTHING,
        CURLE_HTTP2_STREAM, // HTTP/2 stream error / RESET
    ];

    /**
     * Downloads a remote image into a temporary file.
     *
     * The URL must be http(s), its host must pass the optional whitelist and
     * must resolve exclusively to public addresses. The transfer is aborted
     * as soon as more than `max_bytes` have been received. Only retryable
     * failures (see $RETRY_HTTP / $RETRY_CURL, missing HTTP status, or a 2xx
     * response that is not an image) trigger another attempt.
     *
     * On success the caller owns the returned temp file and must move or
     * delete it. On failure no temp file is left behind.
     *
     * NOTE(review): the DNS check only covers the host of the initial URL.
     * Redirect targets and DNS changes between the check and the connection
     * are not re-validated; pin the resolved address or follow redirects
     * manually if that risk matters for the deployment.
     *
     * @param string               $url Image URL (http/https).
     * @param array<string, mixed> $opt Overrides for the defaults listed in the method body.
     *
     * @return array{ok: bool, tmp_path: ?string, mime: ?string, http_code: ?int, curl_err: ?string, final_url: ?string, bytes: int, error?: string}
     */
    public static function download(string $url, array $opt = []): array
    {
        $defaults = [
            'max_bytes'        => 8_000_000,  // 8 MB
            'timeout'          => 12,         // seconds
            'connect_timeout'  => 5,          // seconds
            'max_redirects'    => 5,
            'retries'          => 3,          // total number of attempts (minimum 1)
            'retry_backoff_ms' => 250,        // base backoff, doubled per attempt
            'whitelist_hosts'  => null,       // e.g. ['ikea.com', 'images.ikea.com'] or null
            'ip_resolve_v4'    => true,
            'referer'          => 'auto',     // 'auto' | 'none' | 'custom'
            'custom_referer'   => null,
            'user_agents'      => null,       // overrides the built-in UA list
            'log_prefix'       => 'imgfetch', // prefix for error_log lines
        ];
        $cfg = array_replace($defaults, $opt);

        // 1) Validate the URL and its host.
        if (!self::isValidHttpUrl($url)) {
            return self::fail(null, null, 0, 'Ungültige URL');
        }
        $host = strtolower((string) (parse_url($url, PHP_URL_HOST) ?: ''));
        if ($host === '') {
            return self::fail(null, null, 0, 'Ungültige URL (Host)');
        }

        $whitelist = $cfg['whitelist_hosts'];
        if (is_array($whitelist) && $whitelist !== [] && !self::isHostAllowed($host, $whitelist)) {
            return self::fail(null, null, 0, 'Host nicht erlaubt');
        }

        // DNS must not point at private/internal addresses (SSRF guard).
        if (!self::hostResolvesPublic($host)) {
            return self::fail(null, null, 0, 'Host nicht öffentlich erreichbar');
        }

        // 2) Prepare per-request settings.
        $maxBytes = (int) $cfg['max_bytes'];
        $attempts = max(1, (int) $cfg['retries']);

        // array_values(): a caller-supplied list may have arbitrary keys.
        $uaList = is_array($cfg['user_agents']) && $cfg['user_agents']
            ? array_values($cfg['user_agents'])
            : self::$UA_LIST;

        $referer = match ($cfg['referer']) {
            'none'   => null,
            'custom' => (string) $cfg['custom_referer'],
            default  => self::originFromUrl($url), // auto
        };

        $headers = [
            'Accept: image/avif,image/webp,image/*;q=0.8,*/*;q=0.5',
            'Accept-Language: de-DE,de;q=0.9,en;q=0.8',
            'Cache-Control: no-cache',
            'Pragma: no-cache',
            // Fetch metadata hints; some CDNs inspect these.
            'Sec-Fetch-Dest: image',
            'Sec-Fetch-Mode: no-cors',
            'Sec-Fetch-Site: cross-site',
        ];

        // 3) Attempts with exponential backoff.
        $lastHttp    = null;
        $lastCurlErr = null;

        for ($i = 0; $i < $attempts; $i++) {
            // A fresh temp file per attempt; it is removed again on every failure path,
            // so nothing is left behind when the loop ends without success.
            $tmp = tempnam(sys_get_temp_dir(), 'wlimg_');
            if ($tmp === false) {
                return self::fail(null, null, 0, 'Temp-Datei Fehler');
            }
            $fh = fopen($tmp, 'wb');
            if ($fh === false) {
                return self::fail($tmp, null, 0, 'Temp-Datei Fehler');
            }
            $ch = curl_init($url);
            if ($ch === false) {
                fclose($fh);
                return self::fail($tmp, null, 0, 'Download Fehler (init)');
            }

            $received = 0;
            $opts = [
                CURLOPT_FOLLOWLOCATION  => true,
                CURLOPT_MAXREDIRS       => (int) $cfg['max_redirects'],
                // Never leave http(s), neither initially nor via redirects.
                CURLOPT_PROTOCOLS       => CURLPROTO_HTTP | CURLPROTO_HTTPS,
                CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
                CURLOPT_CONNECTTIMEOUT  => (int) $cfg['connect_timeout'],
                CURLOPT_TIMEOUT         => (int) $cfg['timeout'],
                CURLOPT_USERAGENT       => (string) $uaList[$i % count($uaList)],
                CURLOPT_SSL_VERIFYPEER  => true,
                CURLOPT_SSL_VERIFYHOST  => 2,
                CURLOPT_HTTPHEADER      => $headers,
                CURLOPT_HEADER          => false,
                CURLOPT_RETURNTRANSFER  => false, // stream straight into the file handle
                CURLOPT_FILE            => $fh,   // fallback in case WRITEFUNCTION is not used
                CURLOPT_WRITEFUNCTION   => static function ($ch, $data) use (&$received, $maxBytes, $fh): int {
                    $received += strlen($data);
                    if ($received > $maxBytes) {
                        return 0; // short write -> CURLE_WRITE_ERROR aborts the transfer
                    }
                    $written = fwrite($fh, $data);
                    return $written === false ? 0 : $written;
                },
                CURLOPT_ACCEPT_ENCODING => '', // allow every encoding libcurl supports
            ];
            if ($cfg['ip_resolve_v4']) {
                $opts[CURLOPT_IPRESOLVE] = CURL_IPRESOLVE_V4;
            }
            if ($referer) {
                $opts[CURLOPT_REFERER] = $referer;
            }
            curl_setopt_array($ch, $opts);

            $exec     = curl_exec($ch);
            $http     = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
            $ctype    = (string) (curl_getinfo($ch, CURLINFO_CONTENT_TYPE) ?: '');
            $finalUrl = (string) (curl_getinfo($ch, CURLINFO_EFFECTIVE_URL) ?: $url);
            $cerr     = curl_errno($ch);
            $cerrStr  = curl_error($ch);
            curl_close($ch);
            fclose($fh);

            $lastHttp    = $http;
            $lastCurlErr = $cerrStr;

            // Aborted by our own size limit: report as 413 (Payload Too Large), never retry.
            if ($exec === false && $cerr === CURLE_WRITE_ERROR && $received > $maxBytes) {
                error_log("{$cfg['log_prefix']} size limit hit after {$received} bytes url=$url");
                return self::fail($tmp, null, 413, 'Bild zu groß');
            }

            $hasMoreAttempts = ($i + 1) < $attempts;

            if ($exec !== false && $http >= 200 && $http < 300) {
                $mime = null;
                if (stripos($ctype, 'image/') === 0) {
                    // Drop parameters such as "; charset=..." so callers get a bare media type.
                    $mime = strtolower(trim(explode(';', $ctype, 2)[0]));
                } else {
                    // Some servers send no/incorrect Content-Type: sniff the payload as a last resort.
                    $probe = @getimagesize($tmp);
                    if ($probe !== false) {
                        $mime = $probe['mime'] ?? 'image/*';
                    }
                }

                if ($mime !== null) {
                    return [
                        'ok'        => true,
                        'tmp_path'  => $tmp,
                        'mime'      => $mime,
                        'http_code' => $http,
                        'curl_err'  => null,
                        'final_url' => $finalUrl,
                        'bytes'     => $received,
                    ];
                }

                // 2xx but not an image (block page, HTML, ...): another user agent may get through.
                $doRetry = $hasMoreAttempts;
                error_log("{$cfg['log_prefix']} bad ctype http=$http ctype={$ctype} retry=" . ($doRetry ? '1' : '0') . " url=$url");
            } else {
                // Transport error or non-2xx: retry only when the failure looks transient.
                $doRetry = $hasMoreAttempts
                    && (in_array($http, self::$RETRY_HTTP, true) || in_array($cerr, self::$RETRY_CURL, true) || $http === 0);
                error_log("{$cfg['log_prefix']} fail http=$http curl={$cerr}:{$cerrStr} ua#{$i} retry=" . ($doRetry ? '1' : '0') . " url=$url");
            }

            @unlink($tmp);

            if (!$doRetry) {
                break;
            }

            // Exponential backoff plus jitter before the next attempt.
            $sleepMs = max(0, (int) $cfg['retry_backoff_ms'] * (2 ** $i) + random_int(0, 150));
            usleep($sleepMs * 1000);
        }

        return self::fail(null, $lastCurlErr, $lastHttp ?? 0, 'Bild-Download fehlgeschlagen');
    }

    /**
     * Builds a random, collision-resistant file name that keeps the URL's
     * extension when it looks harmless (1-5 alphanumerics), else uses "jpg".
     */
    public static function safeFileNameFromUrl(string $url): string
    {
        $stripped = strtok($url, '?#');
        $ext = strtolower(pathinfo((string) $stripped, PATHINFO_EXTENSION));
        if (!preg_match('/^[a-z0-9]{1,5}$/', $ext)) {
            $ext = 'jpg';
        }
        return bin2hex(random_bytes(10)) . '.' . $ext;
    }

    /**
     * Returns true when $host equals a whitelist entry or is a subdomain of one.
     * Non-string and empty entries are ignored.
     *
     * @param string       $host      Lower-cased host name.
     * @param array<mixed> $whitelist Allowed host names.
     */
    private static function isHostAllowed(string $host, array $whitelist): bool
    {
        foreach ($whitelist as $allowed) {
            if (!is_string($allowed)) {
                continue;
            }
            $allowed = strtolower(trim($allowed));
            if ($allowed === '') {
                continue; // an empty entry would otherwise match any host ending in "."
            }
            if ($host === $allowed || str_ends_with($host, '.' . $allowed)) {
                return true;
            }
        }
        return false;
    }

    /** Returns true for syntactically valid URLs with an http or https scheme and a host. */
    private static function isValidHttpUrl(string $url): bool
    {
        if (!filter_var($url, FILTER_VALIDATE_URL)) {
            return false;
        }
        $p = parse_url($url);
        if (!$p || empty($p['scheme']) || empty($p['host'])) {
            return false;
        }
        $s = strtolower($p['scheme']);
        return $s === 'http' || $s === 'https';
    }

    /** Returns "scheme://host[:port]/" for $url (default ports omitted), or '' if it cannot be parsed. */
    private static function originFromUrl(string $url): string
    {
        $p = parse_url($url);
        if (!$p || empty($p['scheme']) || empty($p['host'])) {
            return '';
        }
        $port = '';
        if (!empty($p['port'])) {
            // Schemes are case-insensitive, so compare in lower case.
            $default = (strtolower($p['scheme']) === 'https') ? 443 : 80;
            if ((int) $p['port'] !== $default) {
                $port = ':' . $p['port'];
            }
        }
        return $p['scheme'] . '://' . $p['host'] . $port . '/';
    }

    /**
     * Returns true when $host has A/AAAA records and none of them is a
     * private, loopback, link-local or otherwise internal address.
     */
    private static function hostResolvesPublic(string $host): bool
    {
        $recs = @dns_get_record($host, DNS_A | DNS_AAAA);
        if (!$recs) {
            return false;
        }
        $checked = 0;
        foreach ($recs as $r) {
            $ip = ($r['type'] ?? '') === 'A' ? ($r['ip'] ?? null) : ($r['ipv6'] ?? null);
            if (!$ip) {
                continue;
            }
            if (self::isPrivateIp($ip)) {
                return false;
            }
            $checked++;
        }
        // Require at least one address that was actually validated.
        return $checked > 0;
    }

    /**
     * Returns true when $ip must not be contacted: private, loopback,
     * link-local, "this network", carrier-grade NAT, IPv4-mapped IPv6,
     * or not a valid IP address at all.
     */
    private static function isPrivateIp(string $ip): bool
    {
        if (filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4)) {
            $cidrs = [
                '0.0.0.0/8',      // "this network"; 0.0.0.0 reaches localhost on many systems
                '10.0.0.0/8',
                '100.64.0.0/10',  // carrier-grade NAT
                '127.0.0.0/8',
                '169.254.0.0/16', // link-local, incl. cloud metadata endpoints
                '172.16.0.0/12',
                '192.168.0.0/16',
            ];
        } elseif (filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV6)) {
            $cidrs = [
                '::/128',         // unspecified address
                '::1/128',
                '::ffff:0:0/96',  // IPv4-mapped IPv6 would bypass the IPv4 list
                'fc00::/7',
                'fe80::/10',
            ];
        } else {
            return true; // not an IP address: treat as unsafe
        }
        foreach ($cidrs as $c) {
            if (self::ipInCidr($ip, $c)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tests whether $ip lies inside $cidr ("network/prefix"). The address
     * family is taken from $cidr; mismatching families or an out-of-range
     * prefix yield false.
     */
    private static function ipInCidr(string $ip, string $cidr): bool
    {
        [$subnet, $mask] = array_pad(explode('/', $cidr, 2), 2, null);
        $mask = (int) $mask;

        if (str_contains($cidr, ':')) {
            if ($mask < 0 || $mask > 128) {
                return false;
            }
            $binIp     = @inet_pton($ip);
            $binSubnet = @inet_pton((string) $subnet);
            if ($binIp === false || $binSubnet === false || strlen($binIp) !== strlen($binSubnet)) {
                return false;
            }
            // Compare the whole bytes first, then the remaining high bits of the next byte.
            $bytes = intdiv($mask, 8);
            $bits  = $mask % 8;
            if ($bytes && substr($binIp, 0, $bytes) !== substr($binSubnet, 0, $bytes)) {
                return false;
            }
            if ($bits) {
                $partial = 0xFF << (8 - $bits);
                return (ord($binIp[$bytes]) & $partial) === (ord($binSubnet[$bytes]) & $partial);
            }
            return true;
        }

        if ($mask < 0 || $mask > 32) {
            return false;
        }
        $ipL  = ip2long($ip);
        $subL = ip2long((string) $subnet);
        if ($ipL === false || $subL === false) {
            return false;
        }
        $maskL = -1 << (32 - $mask);
        return ($ipL & $maskL) === ($subL & $maskL);
    }

    /**
     * Builds the failure result and removes $tmp when it still exists.
     *
     * @return array{ok: false, tmp_path: null, mime: null, http_code: ?int, curl_err: ?string, final_url: null, bytes: int, error: string}
     */
    private static function fail(?string $tmp, ?string $cerr, int $http, string $msg): array
    {
        if ($tmp && is_file($tmp)) {
            @unlink($tmp);
        }
        return [
            'ok'        => false,
            'tmp_path'  => null,
            'mime'      => null,
            'http_code' => $http > 0 ? $http : null,
            'curl_err'  => $cerr,
            'final_url' => null,
            'bytes'     => 0,
            'error'     => $msg,
        ];
    }
}

181
item.php
View File

@@ -1,6 +1,10 @@
<?php
declare(strict_types=1);
require_once __DIR__ . '/include/image_fetch.php';
use WList\Net\ImageFetch;
/* ========= Session & Bootstrap ========= */
$secure = (!empty($_SERVER['HTTPS']) && $_SERVER['HTTPS'] !== 'off');
session_set_cookie_params([
@@ -37,9 +41,7 @@ if (!empty($app_debug)) {
error_reporting(E_ALL);
}
/* ============= Helpers ============= */
function fail(string $msg = 'Unerwarteter Fehler', int $code = 400): void
{
http_response_code($code);
@@ -90,133 +92,6 @@ function is_valid_http_url(string $url): bool
$s = strtolower($p['scheme']);
return $s === 'http' || $s === 'https';
}
/**
 * Checks whether an IP address falls inside a CIDR range.
 *
 * The address family is decided by the CIDR string: a ":" selects the
 * IPv6 comparison on packed bytes, otherwise the IPv4 integer comparison
 * is used. Unparseable addresses yield false.
 */
function ip_in_cidr(string $ip, string $cidr): bool
{
    // "network/prefix"; a missing prefix becomes null and casts to 0.
    $parts = array_pad(explode('/', $cidr, 2), 2, null);
    $network = $parts[0];
    $prefix = (int) $parts[1];

    if (strpos($cidr, ':') === false) {
        // IPv4: compare the masked 32-bit integers.
        $ipNum = ip2long($ip);
        $netNum = ip2long($network);
        if ($ipNum === false || $netNum === false) {
            return false;
        }
        $netmask = -1 << (32 - $prefix);
        return ($ipNum & $netmask) === ($netNum & $netmask);
    }

    // IPv6: compare whole bytes first, then the leftover high bits.
    $packedIp = inet_pton($ip);
    $packedNet = inet_pton($network);
    if ($packedIp === false || $packedNet === false) {
        return false;
    }

    $wholeBytes = intdiv($prefix, 8);
    $tailBits = $prefix % 8;

    if ($wholeBytes && substr($packedIp, 0, $wholeBytes) !== substr($packedNet, 0, $wholeBytes)) {
        return false;
    }
    if (!$tailBits) {
        return true;
    }

    $tailMask = 0xFF << (8 - $tailBits);
    return (ord($packedIp[$wholeBytes]) & $tailMask) === (ord($packedNet[$wholeBytes]) & $tailMask);
}
/**
 * Reports whether an address belongs to a private, loopback or link-local
 * range (IPv4 or IPv6). Strings that are not valid IP addresses yield false.
 */
function is_private_ip(string $ip): bool
{
    if (filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4)) {
        $ranges = ['10.0.0.0/8', '172.16.0.0/12', '192.168.0.0/16', '127.0.0.0/8', '169.254.0.0/16'];
    } elseif (filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV6)) {
        $ranges = ['::1/128', 'fc00::/7', 'fe80::/10'];
    } else {
        // Not an IP address: nothing to match against.
        $ranges = [];
    }

    foreach ($ranges as $range) {
        if (ip_in_cidr($ip, $range)) {
            return true;
        }
    }

    return false;
}
/**
 * Guards against fetching from internal hosts.
 *
 * Calls fail() with HTTP 400 when the URL has no host, when the host is not
 * covered by the optional global $image_host_whitelist (exact match or
 * subdomain), when the host has no A/AAAA records, or when any resolved
 * address is private according to is_private_ip().
 */
function validate_remote_host_not_private(string $url): void
{
    global $image_host_whitelist;

    $parts = parse_url($url);
    if (!$parts || empty($parts['host'])) {
        fail('Ungültige URL', 400);
    }
    $host = $parts['host'];

    $hasWhitelist = isset($image_host_whitelist)
        && is_array($image_host_whitelist)
        && count($image_host_whitelist) > 0;

    if ($hasWhitelist) {
        $matched = false;
        foreach ($image_host_whitelist as $allowed) {
            // Exact host match, or the host ends in ".<allowed>" (subdomain).
            $isExact = strcasecmp($host, $allowed) === 0;
            if ($isExact || preg_match('/\.' . preg_quote($allowed, '/') . '$/i', $host)) {
                $matched = true;
                break;
            }
        }
        if (!$matched) {
            fail('Host nicht erlaubt', 400);
        }
    }

    $records = dns_get_record($host, DNS_A + DNS_AAAA);
    if (!$records || !count($records)) {
        fail('Host nicht auflösbar', 400);
    }

    foreach ($records as $record) {
        $address = $record['type'] === 'A'
            ? ($record['ip'] ?? null)
            : ($record['ipv6'] ?? null);
        if ($address && is_private_ip($address)) {
            fail('Zieladresse unzulässig', 400);
        }
    }
}
/**
 * Streams a remote resource into a temp file, aborting once more than
 * $maxBytes have been received.
 *
 * Returns the temp file path on a 2xx response. Every error path removes the
 * temp file (where one exists) and calls fail().
 */
function download_remote_image_limited(string $url, int $maxBytes = 5_000_000, int $timeout = 8): string
{
    $tempPath = tempnam(sys_get_temp_dir(), 'wlimg_');
    if ($tempPath === false) {
        fail('Temp-Datei Fehler', 500);
    }

    $out = fopen($tempPath, 'wb');
    if ($out === false) {
        @unlink($tempPath);
        fail('Temp-Datei Fehler', 500);
    }

    $curl = curl_init($url);
    if ($curl === false) {
        fclose($out);
        @unlink($tempPath);
        fail('Download Fehler', 500);
    }

    $total = 0;
    // Returning 0 makes cURL abort the transfer once the limit is exceeded.
    $sink = function ($handle, $chunk) use (&$total, $maxBytes, $out) {
        $total += strlen($chunk);
        return $total > $maxBytes ? 0 : fwrite($out, $chunk);
    };

    curl_setopt_array($curl, [
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_MAXREDIRS => 3,
        CURLOPT_CONNECTTIMEOUT => 3,
        CURLOPT_TIMEOUT => $timeout,
        CURLOPT_USERAGENT => 'wishlist/1.0',
        CURLOPT_SSL_VERIFYPEER => true,
        CURLOPT_SSL_VERIFYHOST => 2,
        CURLOPT_WRITEFUNCTION => $sink,
    ]);

    $result = curl_exec($curl);
    $status = (int) curl_getinfo($curl, CURLINFO_HTTP_CODE);
    curl_close($curl);
    fclose($out);

    $succeeded = $result && $status >= 200 && $status < 300;
    if (!$succeeded) {
        @unlink($tempPath);
        fail('Bild-Download fehlgeschlagen', 400);
    }

    return $tempPath;
}
/**
 * Produces a random file name (20 hex characters) for a downloaded image.
 *
 * The extension is taken from the URL path (query and fragment removed) when
 * it consists of 1-5 alphanumeric characters; otherwise "jpg" is used.
 */
function safe_image_filename_from_url(string $url): string
{
    $withoutQuery = strtok($url, '?#');
    $extension = strtolower(pathinfo((string) $withoutQuery, PATHINFO_EXTENSION));
    $isSafe = preg_match('/^[a-z0-9]{1,5}$/i', $extension);

    $randomStem = bin2hex(random_bytes(10));

    return $randomStem . '.' . ($isSafe ? $extension : 'jpg');
}
/* ============= Controller ============= */
@@ -269,38 +144,50 @@ if ($ItemLink !== '' && !is_valid_http_url($ItemLink))
/* Optional: Bild von externer URL holen */
$imageLocalLink = null;
if (!$removeImage && $ItemImageUrl !== '') {
if (!is_valid_http_url($ItemImageUrl)) {
$whitelist = $image_host_whitelist ?? null;
$fetch = ImageFetch::download($ItemImageUrl, [
'max_bytes' => 8_000_000,
'timeout' => 12,
'connect_timeout' => 5,
'retries' => 4,
'retry_backoff_ms' => 300,
'whitelist_hosts' => $whitelist,
'ip_resolve_v4' => true,
'referer' => 'auto',
'log_prefix' => 'wishlist-img',
]);
if (!$fetch['ok']) {
error_log("wishlist image error: http=" . ($fetch['http_code'] ?? 0) . " curl=" . ($fetch['curl_err'] ?? '-') . " url=$ItemImageUrl");
$conn->close();
fail('Ungültiger Bildlink', 400);
fail('Bild-Download fehlgeschlagen', 400);
}
validate_remote_host_not_private($ItemImageUrl);
$tmp = download_remote_image_limited($ItemImageUrl, 5_000_000, 8);
$info = @getimagesize($tmp);
if ($info === false || empty($info['mime']) || stripos($info['mime'], 'image/') !== 0) {
@unlink($tmp);
$info = @getimagesize($fetch['tmp_path']);
$mime = $info['mime'] ?? $fetch['mime'] ?? 'image/*';
if (stripos($mime, 'image/') !== 0) {
@unlink($fetch['tmp_path']);
$conn->close();
fail('Link ist kein gültiges Bild', 400);
}
global $imagedir;
if (!is_dir($imagedir)) {
if (!is_dir($imagedir))
@mkdir($imagedir, 0755, true);
}
$filename = safe_image_filename_from_url($ItemImageUrl);
$filename = ImageFetch::safeFileNameFromUrl($ItemImageUrl);
$target = rtrim($imagedir, '/') . '/' . $filename;
if (!@rename($tmp, $target)) {
// Fallback falls rename scheitert
if (!@copy($tmp, $target)) {
@unlink($tmp);
if (!@rename($fetch['tmp_path'], $target)) {
if (!@copy($fetch['tmp_path'], $target)) {
@unlink($fetch['tmp_path']);
$conn->close();
fail('Bildspeicherung fehlgeschlagen', 500);
}
@unlink($tmp);
@unlink($fetch['tmp_path']);
}
// HIER: Permissions fixen
@chmod($target, 0644);
$imageLocalLink = $filename;
}