bool,
 *   'tmp_path'  => string|null,
 *   'mime'      => string|null,
 *   'http_code' => int|null,
 *   'curl_err'  => string|null,
 *   'final_url' => string|null,
 *   'bytes'     => int,
 * ]
 */

namespace WList\Net;

/**
 * Downloads a remote image into a temporary file via cURL.
 *
 * Guards implemented by this class: http/https-only URLs, an optional host
 * whitelist, a DNS check that rejects hosts resolving to non-public
 * addresses (applied to the initial URL and to every redirect hop), a hard
 * response-size limit, and bounded retries with exponential backoff.
 *
 * NOTE(review): the DNS check and the actual connection are separate
 * lookups, so a DNS-rebinding attacker can still race them. Pinning the
 * validated address (CURLOPT_RESOLVE) would close that gap.
 */
final class ImageFetch
{
    /** Default User-Agents (rotated per attempt). */
    private static array $UA_LIST = [
        // Current desktop Chrome/Safari/Firefox strings as camouflage.
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15',
        'Mozilla/5.0 (X11; Linux x86_64; rv:129.0) Gecko/20100101 Firefox/129.0',
    ];

    /** HTTP status codes for which a retry is worthwhile. */
    private static array $RETRY_HTTP = [429, 500, 502, 503, 504, 520, 521, 522, 523, 524];

    /** cURL error codes for which a retry is worthwhile. */
    private static array $RETRY_CURL = [
        CURLE_OPERATION_TIMEDOUT,
        CURLE_COULDNT_RESOLVE_HOST,
        CURLE_COULDNT_CONNECT,
        CURLE_RECV_ERROR,
        CURLE_SEND_ERROR,
        CURLE_GOT_NOTHING,
        // libcurl CURLE_HTTP2_STREAM (HTTP/2 stream error/RESET). The numeric
        // code is used because ext/curl may not export this constant, and an
        // undefined constant in a static initializer is a fatal Error.
        92,
    ];

    /**
     * Public API: download $url into a temp file and describe the outcome.
     *
     * Supported $opt keys (all optional): max_bytes, timeout (seconds, per
     * request/redirect hop), connect_timeout, max_redirects (negative values
     * are capped at 20 hops), retries (total attempts, minimum 1),
     * retry_backoff_ms, whitelist_hosts, ip_resolve_v4, referer
     * ('auto' | 'none' | 'custom'), custom_referer, user_agents, log_prefix.
     *
     * @param string               $url Absolute http(s) URL of the image.
     * @param array<string, mixed> $opt Overrides for the defaults below.
     *
     * @return array Result in the shape documented at the top of this file.
     *               On success 'tmp_path' points to the downloaded file and
     *               the caller owns it; on failure no temp file is left
     *               behind and an additional 'error' message is set.
     */
    public static function download(string $url, array $opt = []): array
    {
        $defaults = [
            'max_bytes'        => 8_000_000,  // ~8 MB
            'timeout'          => 12,         // seconds
            'connect_timeout'  => 5,          // seconds
            'max_redirects'    => 5,
            'retries'          => 3,
            'retry_backoff_ms' => 250,        // base backoff
            'whitelist_hosts'  => null,       // ['ikea.com','images.ikea.com'] or null
            'ip_resolve_v4'    => true,
            'referer'          => 'auto',     // 'auto' | 'none' | 'custom'
            'custom_referer'   => null,
            'user_agents'      => null,       // overrides the UA list
            'log_prefix'       => 'imgfetch', // for error_log
        ];
        $cfg = array_replace($defaults, $opt);

        // 1) Validate the URL and check the host.
        if (!self::isValidHttpUrl($url)) {
            return self::fail(null, null, 0, 'Ungültige URL');
        }
        $host = strtolower((string) (parse_url($url, PHP_URL_HOST) ?: ''));
        if (!$host) {
            return self::fail(null, null, 0, 'Ungültige URL (Host)');
        }

        // Optional host whitelist (exact match or sub-domain of an entry).
        if (!self::hostIsWhitelisted($host, $cfg['whitelist_hosts'])) {
            return self::fail(null, null, 0, 'Host nicht erlaubt');
        }

        // DNS: refuse hosts that resolve to private/reserved addresses.
        if (!self::hostResolvesPublic($host)) {
            return self::fail(null, null, 0, 'Host nicht öffentlich erreichbar');
        }

        // 2) Preparation: user agents, referer, request headers.
        $uaList = is_array($cfg['user_agents']) && $cfg['user_agents']
            ? array_values($cfg['user_agents'])
            : self::$UA_LIST;

        $referer = match ($cfg['referer']) {
            'none'   => null,
            'custom' => (string) $cfg['custom_referer'],
            default  => self::originFromUrl($url), // auto
        };

        $headers = [
            'Accept: image/avif,image/webp,image/*;q=0.8,*/*;q=0.5',
            'Accept-Language: de-DE,de;q=0.9,en;q=0.8',
            'Cache-Control: no-cache',
            'Pragma: no-cache',
            // Friendly fetch hints (some CDNs look at these).
            'Sec-Fetch-Dest: image',
            'Sec-Fetch-Mode: no-cors',
            'Sec-Fetch-Site: cross-site',
        ];

        // 3) Attempts with retry.
        $attempts    = max(1, (int) $cfg['retries']);
        $maxBytes    = (int) $cfg['max_bytes'];
        $lastHttp    = null;
        $lastCurlErr = null;

        for ($i = 0; $i < $attempts; $i++) {
            // A fresh temp file per attempt; it is removed on every failure
            // path below so that no attempt leaks a file.
            $tmp = tempnam(sys_get_temp_dir(), 'wlimg_');
            if ($tmp === false) {
                return self::fail(null, null, 0, 'Temp-Datei Fehler');
            }

            $res = self::fetchOnce($url, $tmp, (string) $uaList[$i % count($uaList)], $referer, $headers, $cfg);
            if ($res['setup_error'] !== null) {
                return self::fail($tmp, null, 0, $res['setup_error']);
            }

            $http     = $res['http'];
            $ctype    = $res['ctype'];
            $cerr     = $res['cerr'];
            $cerrStr  = $res['cerr_str'];
            $received = $res['received'];

            $lastHttp    = $http;
            $lastCurlErr = $cerrStr;

            // A redirect pointed at a non-http(s) or non-public target.
            if ($res['blocked']) {
                error_log("{$cfg['log_prefix']} redirect blocked http=$http target={$res['redirect']} url=$url");
                return self::fail($tmp, null, $http, 'Host nicht öffentlich erreichbar');
            }

            // Aborted by the size limit: report it as 413 (Payload Too Large).
            if (!$res['exec_ok'] && $cerr === CURLE_WRITE_ERROR && $received > $maxBytes) {
                error_log("{$cfg['log_prefix']} size limit hit after {$received} bytes url=$url");
                return self::fail($tmp, null, 413, 'Bild zu groß');
            }

            $is2xx = $res['exec_ok'] && $http >= 200 && $http < 300;
            $mime  = null;
            if ($is2xx) {
                if (stripos($ctype, 'image/') === 0) {
                    $mime = $ctype;
                } else {
                    // Some servers send an empty/missing Content-Type. Last
                    // resort: sniff the magic bytes via getimagesize().
                    $probe = @getimagesize($tmp);
                    if ($probe !== false) {
                        $mime = $probe['mime'] ?? 'image/*';
                    }
                }
            }

            if ($mime !== null) {
                return [
                    'ok'        => true,
                    'tmp_path'  => $tmp,
                    'mime'      => $mime,
                    'http_code' => $http,
                    'curl_err'  => null,
                    'final_url' => $res['final_url'],
                    'bytes'     => $received,
                ];
            }

            // Failure: discard this attempt's file and decide about a retry.
            @unlink($tmp);
            $hasMore = ($i + 1) < $attempts;

            if ($is2xx) {
                // 2xx but not an image: block page / HTML / CSS etc.
                $doRetry = $hasMore;
                error_log("{$cfg['log_prefix']} bad ctype http=$http ctype={$ctype} retry=".($doRetry?'1':'0')." url=$url");
            } else {
                // Transport error or non-2xx status.
                $doRetry = $hasMore
                    && (in_array($http, self::$RETRY_HTTP, true)
                        || in_array($cerr, self::$RETRY_CURL, true)
                        || $http === 0);
                error_log("{$cfg['log_prefix']} fail http=$http curl={$cerr}:{$cerrStr} ua#{$i} retry=".($doRetry?'1':'0')." url=$url");
            }

            if (!$doRetry) {
                break;
            }

            // Exponential backoff plus jitter before the next attempt.
            $sleepMs = max(0, (int) ($cfg['retry_backoff_ms'] * (2 ** $i) + random_int(0, 150)));
            usleep($sleepMs * 1000);
        }

        return self::fail(null, $lastCurlErr, $lastHttp ?? 0, 'Bild-Download fehlgeschlagen');
    }

    /**
     * Builds a random file name whose extension is taken from the URL path.
     *
     * Only the path component is inspected, so a URL without a path (for
     * example "https://example.com") no longer yields the host's TLD as the
     * extension. Extensions that are not 1-5 alphanumeric characters fall
     * back to "jpg".
     *
     * NOTE(review): any short alphanumeric extension is accepted, including
     * script extensions such as "php" — callers that store the file in a
     * web-served directory should enforce their own image-extension list.
     *
     * @param string $url Source URL (query string and fragment are ignored).
     *
     * @return string 20 hex characters, a dot, and the extension.
     */
    public static function safeFileNameFromUrl(string $url): string
    {
        $path = parse_url($url, PHP_URL_PATH);
        $ext  = strtolower(pathinfo(is_string($path) ? $path : '', PATHINFO_EXTENSION));
        if (preg_match('/^[a-z0-9]{1,5}\z/', $ext) !== 1) {
            $ext = 'jpg';
        }

        return bin2hex(random_bytes(10)).'.'.$ext;
    }

    /**
     * Performs one download attempt, following redirects manually so that
     * every hop is validated before it is requested.
     *
     * @param string        $url     Initial URL (already validated by the caller).
     * @param string        $tmp     Path of the temp file that receives the body.
     * @param string        $ua      User-Agent header value for this attempt.
     * @param string|null   $referer Referer header value; skipped when empty.
     * @param array<string> $headers Additional request headers.
     * @param array         $cfg     Effective configuration from download().
     *
     * @return array{setup_error: ?string, blocked: bool, redirect: string, exec_ok: bool, http: int, ctype: string, final_url: string, cerr: int, cerr_str: string, received: int}
     */
    private static function fetchOnce(
        string $url,
        string $tmp,
        string $ua,
        ?string $referer,
        array $headers,
        array $cfg
    ): array {
        $maxBytes     = (int) $cfg['max_bytes'];
        $maxRedirects = (int) $cfg['max_redirects'];
        if ($maxRedirects < 0) {
            // cURL treats a negative limit as "unlimited"; because the timeout
            // applies per hop here, keep a hard cap instead.
            $maxRedirects = 20;
        }

        $result = [
            'setup_error' => null,
            'blocked'     => false,
            'redirect'    => '',
            'exec_ok'     => false,
            'http'        => 0,
            'ctype'       => '',
            'final_url'   => $url,
            'cerr'        => 0,
            'cerr_str'    => '',
            'received'    => 0,
        ];

        $current = $url;
        for ($hop = 0; ; $hop++) {
            // 'wb' truncates, so the body of an intermediate 3xx response
            // never ends up in the final file.
            $fh = fopen($tmp, 'wb');
            if ($fh === false) {
                $result['setup_error'] = 'Temp-Datei Fehler';
                return $result;
            }
            $ch = curl_init($current);
            if ($ch === false) {
                fclose($fh);
                $result['setup_error'] = 'Download Fehler (init)';
                return $result;
            }

            $received = 0;
            $opts = [
                // Redirects are followed by this loop, not by cURL.
                CURLOPT_FOLLOWLOCATION => false,
                CURLOPT_PROTOCOLS      => CURLPROTO_HTTP | CURLPROTO_HTTPS,
                CURLOPT_CONNECTTIMEOUT => (int) $cfg['connect_timeout'],
                CURLOPT_TIMEOUT        => (int) $cfg['timeout'],
                CURLOPT_USERAGENT      => $ua,
                CURLOPT_SSL_VERIFYPEER => true,
                CURLOPT_SSL_VERIFYHOST => 2,
                CURLOPT_HTTPHEADER     => $headers,
                CURLOPT_HEADER         => false,
                CURLOPT_RETURNTRANSFER => false, // stream straight into the file handle
                CURLOPT_FILE           => $fh,   // fallback if WRITEFUNCTION is not used
                CURLOPT_WRITEFUNCTION  => static function ($handle, string $data) use (&$received, $maxBytes, $fh): int {
                    $received += strlen($data);
                    if ($received > $maxBytes) {
                        return 0; // -> CURLE_WRITE_ERROR
                    }

                    // A failed/short write also aborts the transfer.
                    return (int) fwrite($fh, $data);
                },
                CURLOPT_ACCEPT_ENCODING => '', // allow gzip/br
            ];
            if ($cfg['ip_resolve_v4']) {
                $opts[CURLOPT_IPRESOLVE] = CURL_IPRESOLVE_V4;
            }
            if ($referer) {
                $opts[CURLOPT_REFERER] = $referer;
            }
            curl_setopt_array($ch, $opts);

            $execOk   = curl_exec($ch) !== false;
            $http     = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
            $ctype    = (string) (curl_getinfo($ch, CURLINFO_CONTENT_TYPE) ?: '');
            $finalUrl = (string) (curl_getinfo($ch, CURLINFO_EFFECTIVE_URL) ?: $current);
            $next     = (string) (curl_getinfo($ch, CURLINFO_REDIRECT_URL) ?: '');
            $cerr     = curl_errno($ch);
            $cerrStr  = curl_error($ch);
            curl_close($ch);
            fclose($fh);

            $result['exec_ok']   = $execOk;
            $result['http']      = $http;
            $result['ctype']     = $ctype;
            $result['final_url'] = $finalUrl;
            $result['cerr']      = $cerr;
            $result['cerr_str']  = $cerrStr;
            $result['received']  = $received;

            $isRedirect = $execOk && $http >= 300 && $http < 400 && $next !== '';
            if (!$isRedirect) {
                return $result;
            }

            if ($hop >= $maxRedirects) {
                // Mirror what cURL reports when CURLOPT_MAXREDIRS is exceeded.
                $result['exec_ok']  = false;
                $result['cerr']     = CURLE_TOO_MANY_REDIRECTS;
                $result['cerr_str'] = sprintf('Maximum (%d) redirects followed', $maxRedirects);
                return $result;
            }

            // Validate the redirect target exactly like the initial URL
            // (scheme + public DNS); the host whitelist is not re-applied.
            $nextHost = strtolower((string) (parse_url($next, PHP_URL_HOST) ?: ''));
            if (!self::isValidHttpUrl($next) || !$nextHost || !self::hostResolvesPublic($nextHost)) {
                $result['blocked']  = true;
                $result['redirect'] = $next;
                return $result;
            }

            $current = $next;
        }
    }

    /**
     * Checks $host against the optional whitelist.
     *
     * @param string $host      Lower-cased host name.
     * @param mixed  $whitelist List of allowed hosts; anything that is not a
     *                          non-empty array disables the check.
     *
     * @return bool True when no whitelist is active, or when $host equals an
     *              entry or is a sub-domain of one.
     */
    private static function hostIsWhitelisted(string $host, mixed $whitelist): bool
    {
        if (!is_array($whitelist) || count($whitelist) === 0) {
            return true;
        }
        foreach ($whitelist as $allowed) {
            if (!is_string($allowed) || $allowed === '') {
                continue;
            }
            $allowed = strtolower($allowed);
            if ($host === $allowed || str_ends_with($host, '.'.$allowed)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Returns true when $url is a syntactically valid absolute http/https URL.
     */
    private static function isValidHttpUrl(string $url): bool
    {
        if (!filter_var($url, FILTER_VALIDATE_URL)) {
            return false;
        }
        $parts = parse_url($url);
        if (!$parts || empty($parts['scheme']) || empty($parts['host'])) {
            return false;
        }

        return in_array(strtolower($parts['scheme']), ['http', 'https'], true);
    }

    /**
     * Derives "scheme://host[:port]/" from $url for use as a Referer.
     *
     * The port is omitted when it is the scheme's default. Returns an empty
     * string when scheme or host cannot be parsed.
     */
    private static function originFromUrl(string $url): string
    {
        $parts = parse_url($url);
        if (!$parts || empty($parts['scheme']) || empty($parts['host'])) {
            return '';
        }

        $port = '';
        if (!empty($parts['port'])) {
            // Compare case-insensitively so "HTTPS://host:443" is recognised
            // as using the default port.
            $default = strtolower($parts['scheme']) === 'https' ? 443 : 80;
            if ((int) $parts['port'] !== $default) {
                $port = ':'.$parts['port'];
            }
        }

        return $parts['scheme'].'://'.$parts['host'].$port.'/';
    }

    /**
     * Resolves $host (A and AAAA records) and returns true only when at least
     * one address was found and none of them is private/reserved.
     *
     * Fails closed: lookup errors or an empty answer yield false.
     */
    private static function hostResolvesPublic(string $host): bool
    {
        // "@": a failed lookup emits a warning; it is handled as "not public".
        $records = @dns_get_record($host, DNS_A | DNS_AAAA);
        if (!$records) {
            return false;
        }

        $checked = 0;
        foreach ($records as $record) {
            $ip = ($record['type'] ?? '') === 'A'
                ? ($record['ip'] ?? null)
                : ($record['ipv6'] ?? null);
            if (!$ip) {
                continue;
            }
            if (self::isPrivateIp($ip)) {
                return false;
            }
            $checked++;
        }

        // Do not treat "records without any usable address" as public.
        return $checked > 0;
    }

    /**
     * Returns true when $ip must not be contacted: private, loopback,
     * link-local, unspecified, carrier-grade NAT, multicast or reserved
     * ranges. IPv4-mapped IPv6 addresses are judged by their embedded IPv4
     * address. Anything that is not a valid IP is treated as private.
     */
    private static function isPrivateIp(string $ip): bool
    {
        if (filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4)) {
            $cidrs = [
                '0.0.0.0/8',      // "this network" / unspecified
                '10.0.0.0/8',
                '100.64.0.0/10',  // carrier-grade NAT
                '127.0.0.0/8',
                '169.254.0.0/16', // link-local
                '172.16.0.0/12',
                '192.168.0.0/16',
                '224.0.0.0/4',    // multicast
                '240.0.0.0/4',    // reserved, includes broadcast
            ];
        } elseif (filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV6)) {
            $packed = inet_pton($ip);
            if ($packed === false) {
                return true;
            }
            // ::ffff:a.b.c.d — unwrap and apply the IPv4 rules.
            if (substr($packed, 0, 12) === "\0\0\0\0\0\0\0\0\0\0\xff\xff") {
                $embedded = inet_ntop(substr($packed, 12));
                return $embedded === false ? true : self::isPrivateIp($embedded);
            }
            $cidrs = [
                '::/128',    // unspecified
                '::1/128',   // loopback
                'fc00::/7',  // unique local
                'fe80::/10', // link-local
                'ff00::/8',  // multicast
            ];
        } else {
            return true;
        }

        foreach ($cidrs as $cidr) {
            if (self::ipInCidr($ip, $cidr)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Tests whether $ip lies inside $cidr ("address/prefix-length").
     *
     * Works on the packed binary form for both IPv4 and IPv6. Returns false
     * when either address is unparsable or the address families differ.
     */
    private static function ipInCidr(string $ip, string $cidr): bool
    {
        [$subnet, $prefix] = array_pad(explode('/', $cidr, 2), 2, null);

        $binIp     = inet_pton($ip);
        $binSubnet = inet_pton((string) $subnet);
        if ($binIp === false || $binSubnet === false || strlen($binIp) !== strlen($binSubnet)) {
            return false;
        }

        $totalBits = strlen($binIp) * 8;
        $prefix    = min($totalBits, max(0, (int) $prefix));
        $fullBytes = intdiv($prefix, 8);
        $restBits  = $prefix % 8;

        if ($fullBytes > 0 && substr($binIp, 0, $fullBytes) !== substr($binSubnet, 0, $fullBytes)) {
            return false;
        }
        if ($restBits === 0) {
            return true;
        }

        // Compare only the leading $restBits bits of the next byte.
        $mask = (0xFF << (8 - $restBits)) & 0xFF;

        return (ord($binIp[$fullBytes]) & $mask) === (ord($binSubnet[$fullBytes]) & $mask);
    }

    /**
     * Builds the failure result and removes $tmp if it still exists.
     *
     * @param string|null $tmp  Temp file to delete, if any.
     * @param string|null $cerr cURL error text to report.
     * @param int         $http HTTP status; values <= 0 are reported as null.
     * @param string      $msg  Error message stored under 'error'.
     */
    private static function fail(?string $tmp, ?string $cerr, int $http, string $msg): array
    {
        if ($tmp && is_file($tmp)) {
            @unlink($tmp);
        }

        return [
            'ok'        => false,
            'tmp_path'  => null,
            'mime'      => null,
            'http_code' => $http > 0 ? $http : null,
            'curl_err'  => $cerr,
            'final_url' => null,
            'bytes'     => 0,
            'error'     => $msg,
        ];
    }
}