From d38123ce6709b8eece4e3b104ef7b052d2630d23 Mon Sep 17 00:00:00 2001
From: Marcel Peterkau
Date: Mon, 15 Sep 2025 08:13:23 +0200
Subject: [PATCH] reworked image-fetching

---
Review notes (revised patch):
 - download(): honour the retry decision (no retries for e.g. 404), reuse one
   temp file so a failed run no longer leaks one, follow redirects manually so
   every hop is host-checked, restrict transfers to http/https.
 - CURLE_HTTP2_STREAM replaced by its numeric libcurl value (92).
 - isPrivateIp()/ipInCidr(): more reserved ranges, one code path for v4/v6.
 - safeFileNameFromUrl(): extension from detected MIME type / allowlist only.
 - item.php: require a successful getimagesize() again.
 - Parts of this patch were lost during extraction (file header prose of
   image_fetch.php, the first item.php hunk and the start of the second).
   They are left as found; hunk counts reflect the lines present here.

 include/image_fetch.php | 496 ++++++++++++++++++++++++++++++++++++++++
 item.php                | 182 ++++----------------
 2 files changed, 531 insertions(+), 147 deletions(-)
 create mode 100644 include/image_fetch.php

diff --git a/include/image_fetch.php b/include/image_fetch.php
new file mode 100644
index 0000000..efc79a5
--- /dev/null
+++ b/include/image_fetch.php
@@ -0,0 +1,496 @@
+<?php
+/**
+ * SSRF-aware download of remote images into a temporary file.
+ *
+ * NOTE(review): the original header of this file was lost when the patch was
+ * extracted; only the tail of the result-shape description survived.
+ *
+ * ImageFetch::download() returns:
+ * [
+ *   'ok' => bool,
+ *   'tmp_path' => string|null,
+ *   'mime' => string|null,
+ *   'http_code' => int|null,
+ *   'curl_err' => string|null,
+ *   'final_url' => string|null,
+ *   'bytes' => int,
+ *   'error' => string|null,
+ * ]
+ */
+
+namespace WList\Net;
+
+final class ImageFetch
+{
+    /** Default user agents, rotated per attempt. */
+    private static array $UA_LIST = [
+        // Current desktop Chrome/Safari/Firefox strings used as camouflage.
+        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0 Safari/537.36',
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0 Safari/537.36',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15',
+        'Mozilla/5.0 (X11; Linux x86_64; rv:129.0) Gecko/20100101 Firefox/129.0',
+    ];
+
+    /** HTTP status codes for which a retry is worthwhile. */
+    private static array $RETRY_HTTP = [429, 500, 502, 503, 504, 520, 521, 522, 523, 524];
+
+    /** cURL error codes for which a retry is worthwhile. */
+    private static array $RETRY_CURL = [
+        CURLE_OPERATION_TIMEDOUT,
+        CURLE_COULDNT_RESOLVE_HOST,
+        CURLE_COULDNT_CONNECT,
+        CURLE_RECV_ERROR,
+        CURLE_SEND_ERROR,
+        CURLE_GOT_NOTHING,
+        // libcurl CURLE_HTTP2_STREAM (HTTP/2 stream error/RESET). Written numerically:
+        // ext/curl may not define that constant, and an undefined constant in a static
+        // initializer raises a fatal Error on first use of the class.
+        92,
+    ];
+
+    /** Image types this class is willing to name a file after, with their extension. */
+    private const MIME_EXT = [
+        'image/jpeg' => 'jpg',
+        'image/png' => 'png',
+        'image/gif' => 'gif',
+        'image/webp' => 'webp',
+        'image/avif' => 'avif',
+        'image/bmp' => 'bmp',
+        'image/x-ms-bmp' => 'bmp',
+    ];
+
+    /**
+     * Download an image from $url into a temporary file.
+     *
+     * Redirects are followed manually so that every hop passes the same host checks
+     * (whitelist, no private/reserved addresses) as the initial URL.
+     *
+     * On success the caller owns 'tmp_path' and must move or delete it. On failure no
+     * temporary file is left behind and 'error' holds a short message.
+     *
+     * @param string $url Absolute http(s) URL of the image.
+     * @param array  $opt Overrides for the keys of $defaults below.
+     * @return array Result array as described in the file header.
+     */
+    public static function download(string $url, array $opt = []): array
+    {
+        $defaults = [
+            'max_bytes' => 8_000_000, // 8 MB
+            'timeout' => 12, // seconds, per request
+            'connect_timeout' => 5, // seconds
+            'max_redirects' => 5,
+            'retries' => 3,
+            'retry_backoff_ms' => 250, // base backoff
+            'whitelist_hosts' => null, // ['ikea.com','images.ikea.com'] or null
+            'ip_resolve_v4' => true,
+            'referer' => 'auto', // 'auto' | 'none' | 'custom'
+            'custom_referer' => null,
+            'user_agents' => null, // overrides the built-in UA list
+            'log_prefix' => 'imgfetch', // prefix for error_log()
+        ];
+        $cfg = array_replace($defaults, $opt);
+
+        // 1) Validate URL and host (whitelist, public DNS) before touching the network.
+        $targetErr = self::targetError($url, $cfg['whitelist_hosts']);
+        if ($targetErr !== null) {
+            return self::fail(null, null, 0, $targetErr);
+        }
+
+        // 2) One temp file for all attempts; every request truncates it again.
+        $tmp = tempnam(sys_get_temp_dir(), 'wlimg_');
+        if ($tmp === false) {
+            return self::fail(null, null, 0, 'Temp-Datei Fehler');
+        }
+
+        // 3) User agents, referer and request headers.
+        $uaList = is_array($cfg['user_agents']) && $cfg['user_agents']
+            ? array_values($cfg['user_agents'])
+            : self::$UA_LIST;
+        $referer = match ($cfg['referer']) {
+            'none' => null,
+            'custom' => (string) $cfg['custom_referer'],
+            default => self::originFromUrl($url), // auto
+        };
+        $headers = [
+            'Accept: image/avif,image/webp,image/*;q=0.8,*/*;q=0.5',
+            'Accept-Language: de-DE,de;q=0.9,en;q=0.8',
+            'Cache-Control: no-cache',
+            'Pragma: no-cache',
+            // Fetch metadata hints (some CDNs look at them).
+            'Sec-Fetch-Dest: image',
+            'Sec-Fetch-Mode: no-cors',
+            'Sec-Fetch-Site: cross-site',
+        ];
+
+        // 4) Attempts with retry.
+        $attempts = max(1, (int) $cfg['retries']);
+        $maxBytes = (int) $cfg['max_bytes'];
+        $lastHttp = 0;
+        $lastCurlErr = null;
+
+        for ($i = 0; $i < $attempts; $i++) {
+            $ua = $uaList[$i % count($uaList)];
+            $res = self::fetchOnce($url, $tmp, $ua, $headers, $referer, $cfg);
+
+            // Local failure or a redirect to a disallowed target: a retry cannot help.
+            if ($res['fatal'] !== null) {
+                return self::fail($tmp, null, 0, $res['fatal']);
+            }
+
+            $http = $res['http'];
+            $cerr = $res['cerr'];
+            $lastHttp = $http;
+            $lastCurlErr = $res['cerr_str'];
+
+            // Aborted by the size limit: report as 413 (Payload Too Large).
+            if (!$res['exec'] && $cerr === CURLE_WRITE_ERROR && $res['bytes'] > $maxBytes) {
+                error_log("{$cfg['log_prefix']} size limit hit after {$res['bytes']} bytes url=$url");
+                return self::fail($tmp, null, 413, 'Bild zu groß');
+            }
+
+            if ($res['exec'] && $http >= 200 && $http < 300) {
+                $mime = null;
+                if (stripos($res['ctype'], 'image/') === 0) {
+                    $mime = $res['ctype'];
+                } else {
+                    // Some servers send an empty/missing Content-Type: sniff the payload.
+                    $probe = @getimagesize($tmp);
+                    if ($probe !== false) {
+                        $mime = $probe['mime'] ?? 'image/*';
+                    }
+                }
+                if ($mime !== null) {
+                    return [
+                        'ok' => true,
+                        'tmp_path' => $tmp,
+                        'mime' => $mime,
+                        'http_code' => $http,
+                        'curl_err' => null,
+                        'final_url' => $res['final_url'],
+                        'bytes' => $res['bytes'],
+                        'error' => null,
+                    ];
+                }
+                // 2xx without an image (block page, HTML, CSS ...): try the next user agent.
+                $retryable = true;
+                $what = "bad ctype http=$http ctype={$res['ctype']}";
+            } else {
+                // Transport error or non-2xx: retry only when the failure looks transient.
+                $retryable = $http === 0
+                    || in_array($http, self::$RETRY_HTTP, true)
+                    || in_array($cerr, self::$RETRY_CURL, true);
+                $what = "fail http=$http curl={$cerr}:{$res['cerr_str']} ua#{$i}";
+            }
+
+            $doRetry = $retryable && ($i + 1) < $attempts;
+            error_log("{$cfg['log_prefix']} $what retry=" . ($doRetry ? '1' : '0') . " url=$url");
+            if (!$doRetry) {
+                break;
+            }
+
+            // Exponential backoff plus jitter before the next attempt.
+            $sleepMs = max(0, (int) ($cfg['retry_backoff_ms'] * (2 ** $i))) + random_int(0, 150);
+            usleep($sleepMs * 1000);
+        }
+
+        return self::fail($tmp, $lastCurlErr, $lastHttp, 'Bild-Download fehlgeschlagen');
+    }
+
+    /**
+     * Build a random, storage-safe file name for a downloaded image.
+     *
+     * The extension comes from $mime when it names a known image type, otherwise from
+     * the URL path. Anything that is not a known image extension becomes 'jpg', so a URL
+     * such as ".../x.php" can never yield an executable file name.
+     *
+     * @param string      $url  Source URL of the image.
+     * @param string|null $mime Detected MIME type, if known.
+     * @return string File name of the form "<20 hex chars>.<ext>".
+     */
+    public static function safeFileNameFromUrl(string $url, ?string $mime = null): string
+    {
+        $ext = null;
+        if ($mime !== null) {
+            $type = strtolower(trim(explode(';', $mime, 2)[0]));
+            $ext = self::MIME_EXT[$type] ?? null;
+        }
+        if ($ext === null) {
+            $path = (string) parse_url($url, PHP_URL_PATH);
+            $ext = strtolower(pathinfo($path, PATHINFO_EXTENSION));
+            if ($ext !== 'jpeg' && !in_array($ext, self::MIME_EXT, true)) {
+                $ext = 'jpg';
+            }
+        }
+        return bin2hex(random_bytes(10)) . '.' . $ext;
+    }
+
+    /**
+     * Run one download attempt into $tmp, following redirects hop by hop.
+     *
+     * 'fatal' is non-null when the attempt failed for a reason a retry cannot fix
+     * (temp file/cURL init failure, or a redirect to a disallowed target).
+     *
+     * NOTE(review): hosts are checked via DNS before connecting, but the address is not
+     * pinned for the transfer, so a DNS answer that changes in between is not caught.
+     *
+     * @return array{fatal: ?string, exec: bool, http: int, ctype: string, final_url: string,
+     *               cerr: int, cerr_str: string, bytes: int}
+     */
+    private static function fetchOnce(
+        string $url,
+        string $tmp,
+        string $ua,
+        array $headers,
+        ?string $referer,
+        array $cfg
+    ): array {
+        $maxBytes = (int) $cfg['max_bytes'];
+        $maxHops = max(0, (int) $cfg['max_redirects']);
+        $res = [
+            'fatal' => null, 'exec' => false, 'http' => 0, 'ctype' => '',
+            'final_url' => $url, 'cerr' => 0, 'cerr_str' => '', 'bytes' => 0,
+        ];
+
+        for ($hop = 0; $hop <= $maxHops; $hop++) {
+            // The first URL was validated by download(); every redirect target is checked here.
+            if ($hop > 0) {
+                $res['fatal'] = self::targetError($url, $cfg['whitelist_hosts']);
+                if ($res['fatal'] !== null) {
+                    return $res;
+                }
+            }
+
+            $fh = fopen($tmp, 'wb'); // truncates whatever a previous hop/attempt wrote
+            if ($fh === false) {
+                $res['fatal'] = 'Temp-Datei Fehler';
+                return $res;
+            }
+            $ch = curl_init($url);
+            if ($ch === false) {
+                fclose($fh);
+                $res['fatal'] = 'Download Fehler (init)';
+                return $res;
+            }
+
+            $received = 0;
+            $opts = [
+                CURLOPT_FOLLOWLOCATION => false, // redirects are validated and followed below
+                CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
+                CURLOPT_CONNECTTIMEOUT => (int) $cfg['connect_timeout'],
+                CURLOPT_TIMEOUT => (int) $cfg['timeout'],
+                CURLOPT_USERAGENT => $ua,
+                CURLOPT_SSL_VERIFYPEER => true,
+                CURLOPT_SSL_VERIFYHOST => 2,
+                CURLOPT_HTTPHEADER => $headers,
+                CURLOPT_HEADER => false,
+                CURLOPT_RETURNTRANSFER => false, // stream straight into the file handle
+                CURLOPT_FILE => $fh, // fallback in case WRITEFUNCTION is not used
+                CURLOPT_WRITEFUNCTION => static function ($ch, $data) use (&$received, $maxBytes, $fh) {
+                    $received += strlen($data);
+                    if ($received > $maxBytes) {
+                        return 0; // -> CURLE_WRITE_ERROR
+                    }
+                    $written = fwrite($fh, $data);
+                    return $written === false ? 0 : $written;
+                },
+                CURLOPT_ACCEPT_ENCODING => '', // allow every encoding libcurl supports
+            ];
+            if ($cfg['ip_resolve_v4']) {
+                $opts[CURLOPT_IPRESOLVE] = CURL_IPRESOLVE_V4;
+            }
+            if ($referer) {
+                $opts[CURLOPT_REFERER] = $referer;
+            }
+            curl_setopt_array($ch, $opts);
+
+            $exec = curl_exec($ch);
+            $http = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
+            $next = (string) (curl_getinfo($ch, CURLINFO_REDIRECT_URL) ?: '');
+            $res = [
+                'fatal' => null,
+                'exec' => $exec !== false,
+                'http' => $http,
+                'ctype' => (string) (curl_getinfo($ch, CURLINFO_CONTENT_TYPE) ?: ''),
+                'final_url' => (string) (curl_getinfo($ch, CURLINFO_EFFECTIVE_URL) ?: $url),
+                'cerr' => curl_errno($ch),
+                'cerr_str' => curl_error($ch),
+                'bytes' => $received,
+            ];
+            curl_close($ch);
+            fclose($fh);
+
+            $isRedirect = $exec !== false && $http >= 300 && $http < 400 && $next !== '';
+            if (!$isRedirect) {
+                return $res;
+            }
+            $url = $next;
+        }
+
+        // Redirect limit exceeded: the last 3xx result is returned and counts as a failure.
+        return $res;
+    }
+
+    /**
+     * Check that $url is an http(s) URL whose host is whitelisted (if a whitelist is
+     * given) and resolves to public addresses only.
+     *
+     * @param mixed $whitelist List of allowed host names (subdomains included) or null.
+     * @return string|null Error message, or null when the target is acceptable.
+     */
+    private static function targetError(string $url, mixed $whitelist): ?string
+    {
+        if (!self::isValidHttpUrl($url)) {
+            return 'Ungültige URL';
+        }
+        $host = strtolower((string) parse_url($url, PHP_URL_HOST));
+        if ($host === '') {
+            return 'Ungültige URL (Host)';
+        }
+
+        if (is_array($whitelist) && count($whitelist) > 0) {
+            $allowed = false;
+            foreach ($whitelist as $entry) {
+                $entry = strtolower((string) $entry);
+                if ($entry !== '' && ($host === $entry || str_ends_with($host, '.' . $entry))) {
+                    $allowed = true;
+                    break;
+                }
+            }
+            if (!$allowed) {
+                return 'Host nicht erlaubt';
+            }
+        }
+
+        // DNS check: no private or reserved addresses.
+        if (!self::hostResolvesPublic($host)) {
+            return 'Host nicht öffentlich erreichbar';
+        }
+        return null;
+    }
+
+    /** True when $url is a syntactically valid URL with scheme http or https and a host. */
+    private static function isValidHttpUrl(string $url): bool
+    {
+        if (filter_var($url, FILTER_VALIDATE_URL) === false) {
+            return false;
+        }
+        $p = parse_url($url);
+        if (!$p || empty($p['scheme']) || empty($p['host'])) {
+            return false;
+        }
+        return in_array(strtolower($p['scheme']), ['http', 'https'], true);
+    }
+
+    /** Origin ("scheme://host[:port]/") of $url, used as automatic referer; '' if unparsable. */
+    private static function originFromUrl(string $url): string
+    {
+        $p = parse_url($url);
+        if (!$p || empty($p['scheme']) || empty($p['host'])) {
+            return '';
+        }
+        $port = '';
+        if (!empty($p['port'])) {
+            $default = strtolower($p['scheme']) === 'https' ? 443 : 80;
+            if ((int) $p['port'] !== $default) {
+                $port = ':' . $p['port'];
+            }
+        }
+        return $p['scheme'] . '://' . $p['host'] . $port . '/';
+    }
+
+    /**
+     * True when $host is a public IP literal, or a name with at least one A/AAAA record
+     * and no record pointing at a private or reserved address.
+     */
+    private static function hostResolvesPublic(string $host): bool
+    {
+        // parse_url() keeps the brackets around IPv6 literals.
+        $literal = trim($host, '[]');
+        if (filter_var($literal, FILTER_VALIDATE_IP) !== false) {
+            return !self::isPrivateIp($literal);
+        }
+
+        $recs = @dns_get_record($host, DNS_A | DNS_AAAA);
+        if (!$recs) {
+            return false;
+        }
+        $seen = false;
+        foreach ($recs as $r) {
+            $ip = ($r['type'] ?? '') === 'A' ? ($r['ip'] ?? null) : ($r['ipv6'] ?? null);
+            if (!$ip) {
+                continue;
+            }
+            if (self::isPrivateIp($ip)) {
+                return false;
+            }
+            $seen = true;
+        }
+        return $seen;
+    }
+
+    /** True when $ip is private, loopback, link-local or otherwise not publicly routable. */
+    private static function isPrivateIp(string $ip): bool
+    {
+        if (filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4)) {
+            $cidrs = [
+                '0.0.0.0/8', '10.0.0.0/8', '100.64.0.0/10', '127.0.0.0/8', '169.254.0.0/16',
+                '172.16.0.0/12', '192.0.0.0/24', '192.168.0.0/16', '198.18.0.0/15',
+                '224.0.0.0/4', '240.0.0.0/4',
+            ];
+        } elseif (filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV6)) {
+            // ::ffff:0:0/96 (IPv4-mapped) and 64:ff9b::/96 (NAT64) can smuggle IPv4 targets.
+            $cidrs = ['::/128', '::1/128', '::ffff:0:0/96', '64:ff9b::/96', 'fc00::/7', 'fe80::/10', 'ff00::/8'];
+        } else {
+            return true; // not an IP address at all: treat as unsafe
+        }
+        foreach ($cidrs as $cidr) {
+            if (self::ipInCidr($ip, $cidr)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /** True when $ip lies inside $cidr ("addr/bits"); both must be of the same IP family. */
+    private static function ipInCidr(string $ip, string $cidr): bool
+    {
+        [$subnet, $mask] = array_pad(explode('/', $cidr, 2), 2, null);
+        $binIp = inet_pton($ip);
+        $binSubnet = inet_pton((string) $subnet);
+        if ($binIp === false || $binSubnet === false || strlen($binIp) !== strlen($binSubnet)) {
+            return false;
+        }
+        $maxBits = strlen($binIp) * 8;
+        $mask = $mask === null ? $maxBits : max(0, min($maxBits, (int) $mask));
+
+        // Compare the whole bytes first, then the remaining high bits of the next byte.
+        $bytes = intdiv($mask, 8);
+        $bits = $mask % 8;
+        if ($bytes > 0 && substr($binIp, 0, $bytes) !== substr($binSubnet, 0, $bytes)) {
+            return false;
+        }
+        if ($bits === 0) {
+            return true;
+        }
+        $keep = (0xFF << (8 - $bits)) & 0xFF;
+        return (ord($binIp[$bytes]) & $keep) === (ord($binSubnet[$bytes]) & $keep);
+    }
+
+    /** Build the failure result; removes $tmp if it exists. */
+    private static function fail(?string $tmp, ?string $cerr, int $http, string $msg): array
+    {
+        if ($tmp !== null && is_file($tmp)) {
+            @unlink($tmp);
+        }
+        return [
+            'ok' => false,
+            'tmp_path' => null,
+            'mime' => null,
+            'http_code' => $http > 0 ? $http : null,
+            'curl_err' => $cerr,
+            'final_url' => null,
+            'bytes' => 0,
+            'error' => $msg,
+        ];
+    }
+}
diff --git a/item.php b/item.php
index 1828b2a..ba1c9df 100644
--- a/item.php
+++ b/item.php
@@ -1,6 +1,10 @@ 0) {
-        $ok = false;
-        foreach ($image_host_whitelist as $allowed) {
-            if (strcasecmp($host, $allowed) === 0) {
-                $ok = true;
-                break;
-            }
-            if (preg_match('/\.' . preg_quote($allowed, '/') . '$/i', $host)) {
-                $ok = true;
-                break;
-            }
-        }
-        if (!$ok)
-            fail('Host nicht erlaubt', 400);
-    }
-    $recs = dns_get_record($host, DNS_A + DNS_AAAA);
-    if (!$recs || !count($recs))
-        fail('Host nicht auflösbar', 400);
-    foreach ($recs as $r) {
-        $ip = $r['type'] === 'A' ? ($r['ip'] ?? null) : ($r['ipv6'] ?? null);
-        if (!$ip)
-            continue;
-        if (is_private_ip($ip))
-            fail('Zieladresse unzulässig', 400);
-    }
-}
-function download_remote_image_limited(string $url, int $maxBytes = 5_000_000, int $timeout = 8): string
-{
-    $tmp = tempnam(sys_get_temp_dir(), 'wlimg_');
-    if ($tmp === false)
-        fail('Temp-Datei Fehler', 500);
-    $fh = fopen($tmp, 'wb');
-    if ($fh === false) {
-        @unlink($tmp);
-        fail('Temp-Datei Fehler', 500);
-    }
-    $ch = curl_init($url);
-    if ($ch === false) {
-        fclose($fh);
-        @unlink($tmp);
-        fail('Download Fehler', 500);
-    }
-    $received = 0;
-    curl_setopt_array($ch, [
-        CURLOPT_FOLLOWLOCATION => true,
-        CURLOPT_MAXREDIRS => 3,
-        CURLOPT_CONNECTTIMEOUT => 3,
-        CURLOPT_TIMEOUT => $timeout,
-        CURLOPT_USERAGENT => 'wishlist/1.0',
-        CURLOPT_SSL_VERIFYPEER => true,
-        CURLOPT_SSL_VERIFYHOST => 2,
-        CURLOPT_WRITEFUNCTION => function ($ch, $data) use (&$received, $maxBytes, $fh) {
-            $len = strlen($data);
-            $received += $len;
-            if ($received > $maxBytes)
-                return 0;
-            return fwrite($fh, $data);
-        }
-    ]);
-    $ok = curl_exec($ch);
-    $code = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
-    curl_close($ch);
-    fclose($fh);
-    if (!$ok || $code < 200 || $code >= 300) {
-        @unlink($tmp);
-        fail('Bild-Download fehlgeschlagen', 400);
-    }
-    return $tmp;
-}
-function safe_image_filename_from_url(string $url): string
-{
-    $stripped = strtok($url, '?#');
-    $ext = strtolower(pathinfo((string) $stripped, PATHINFO_EXTENSION));
-    if (!preg_match('/^[a-z0-9]{1,5}$/i', $ext))
-        $ext = 'jpg';
-    return bin2hex(random_bytes(10)) . '.' . $ext;
-}
 /* ============= Controller ============= */
@@ -269,36 +144,49 @@ if ($ItemLink !== '' && !is_valid_http_url($ItemLink))
 /* Optional: Bild von externer URL holen */
 $imageLocalLink = null;
 if (!$removeImage && $ItemImageUrl !== '') {
-    if (!is_valid_http_url($ItemImageUrl)) {
+
+    $whitelist = $image_host_whitelist ?? null;
+
+    $fetch = ImageFetch::download($ItemImageUrl, [
+        'max_bytes' => 8_000_000,
+        'timeout' => 12,
+        'connect_timeout' => 5,
+        'retries' => 4,
+        'retry_backoff_ms' => 300,
+        'whitelist_hosts' => $whitelist,
+        'ip_resolve_v4' => true,
+        'referer' => 'auto',
+        'log_prefix' => 'wishlist-img',
+    ]);
+
+    if (!$fetch['ok']) {
+        error_log("wishlist image error: http=" . ($fetch['http_code'] ?? 0) . " curl=" . ($fetch['curl_err'] ?? '-') . " url=$ItemImageUrl");
         $conn->close();
-        fail('Ungültiger Bildlink', 400);
+        fail('Bild-Download fehlgeschlagen', 400);
     }
-    validate_remote_host_not_private($ItemImageUrl);
-    $tmp = download_remote_image_limited($ItemImageUrl, 5_000_000, 8);
-    $info = @getimagesize($tmp);
-    if ($info === false || empty($info['mime']) || stripos($info['mime'], 'image/') !== 0) {
-        @unlink($tmp);
+
+    // Trust only the sniffed type: the server-declared Content-Type is attacker-controlled.
+    $info = @getimagesize($fetch['tmp_path']);
+    $mime = is_array($info) ? (string) ($info['mime'] ?? '') : '';
+    if (stripos($mime, 'image/') !== 0) {
+        @unlink($fetch['tmp_path']);
         $conn->close();
         fail('Link ist kein gültiges Bild', 400);
     }
+
     global $imagedir;
-    if (!is_dir($imagedir)) {
+    if (!is_dir($imagedir))
         @mkdir($imagedir, 0755, true);
-    }
-    $filename = safe_image_filename_from_url($ItemImageUrl);
+    $filename = ImageFetch::safeFileNameFromUrl($ItemImageUrl, $mime);
     $target = rtrim($imagedir, '/') . '/' . $filename;
-    if (!@rename($tmp, $target)) {
-        // Fallback falls rename scheitert
-        if (!@copy($tmp, $target)) {
-            @unlink($tmp);
+    if (!@rename($fetch['tmp_path'], $target)) {
+        if (!@copy($fetch['tmp_path'], $target)) {
+            @unlink($fetch['tmp_path']);
             $conn->close();
             fail('Bildspeicherung fehlgeschlagen', 500);
         }
-        @unlink($tmp);
+        @unlink($fetch['tmp_path']);
     }
-
-    // HIER: Permissions fixen
     @chmod($target, 0644);
-
     $imageLocalLink = $filename;
 }