[ "dir" => "周韦彤", "code" => 1456, "name" => "周韦彤" ], "ycc" => [ "dir" => "ycc", "code" => 459, "name" => "杨晨晨" ], "ry" => [ "dir" => "忍野さら", "code" => "1875", "name" => "忍野さら" ], "azu" => [ "dir" => "azu", "code" => 437, "name" => "阿朱" ], "xq" => [ "dir" => "xq", "code" => 2438, "name" => "小琪" ], "ygh" => [ "dir" => "ygh", "code" => 550, "name" => "原干惠" ], "wyc" => [ "dir" => "wyc", "code" => 293, "name" => "王语纯" ], "zz" => [ "dir" => "zz", "code" => 954, "name" => "芝芝 booty" ], "hlr" => [ "dir" => "hlr", "code" => 1289, "name" => "黄乐然" ], "jrq" => [ "dir" => "jrq", "code" => 5034, "name" => "姜仁卿" ], "ny" => [ "dir" => "ny", "code" => 5301, "name" => "奈月" ], "杉本有美" => [ "dir" => "杉本有美", "code" => 632, "name" => "杉本有美" ], "糯美子" => [ "dir" => "糯美子", "code" => 161, "name" => "糯美子" ], "小雪" => [ "dir" => "小雪", "code" => 388, "name" => "小雪" ] ];

    /**
     * Download album images for every entry of self::$name_dir.
     *
     * For each person the method:
     *   1. indexes the album folders already present under the person's base
     *      directory by their numeric id prefix ("<id>-<title>"),
     *   2. loads the person's listing page and reads the page count from the
     *      href of the last pagination link,
     *   3. walks every listing page, and for every album downloads the images
     *      1.jpg .. N.jpg (N is parsed from the "<N>P" counter) that are not
     *      on disk yet.
     *
     * Failures (unreachable page, unparsable album link, failed download,
     * unwritable file) are logged and skipped instead of aborting the run.
     *
     * @param bool $isAll When true the per-person album limit is 5000 instead of 8.
     *
     * @return void
     */
    public function scrapeTujiguGirls($isAll = false)
    {
        // Request headers for image downloads; identical for every image, so built once.
        $imageHeaders = [
            'Accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
            'Accept-Language: zh-CN,zh;q=0.9',
            'Cache-Control: no-cache',
            'Connection: keep-alive',
            'Pragma: no-cache',
            'Referer: https://www.tujidao01.com/',
            'Sec-Fetch-Dest: image',
            'Sec-Fetch-Mode: no-cors',
            'Sec-Fetch-Site: cross-site',
            'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.61 Safari/537.36',
            'sec-ch-ua: " Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
            'sec-ch-ua-mobile: ?0',
            'sec-ch-ua-platform: "macOS"',
        ];

        foreach (self::$name_dir as $username => $name) {
            // Reset per person: album ids may repeat across sites and across people.
            $albumCodeMap = [];
            $baseDir = "/Volumes/intel660p/image/xg/" . $name["dir"] . "/";

            // Album titles differ between the two source sites, so existing
            // folders are matched by their numeric album id prefix only.
            if (is_dir($baseDir)) {
                // scandir() returns false on failure; treat that as "no folders".
                $existingEntries = scandir($baseDir) ?: [];
                foreach ($existingEntries as $albumDir) {
                    if ($albumDir === "." || $albumDir === "..") {
                        continue;
                    }
                    $idPrefix = explode("-", $albumDir)[0];
                    if (is_dir($baseDir . $albumDir) && is_numeric($idPrefix)) {
                        $albumCodeMap[$idPrefix] = $albumDir;
                    }
                }
            }

            // Random 100-1000 ms pause between people.
            usleep(random_int(100, 1000) * 1000);
            $peopleUrl = "https://www.tujidao01.com/t/?id=" . $name['code'];

            // Selector of the last pagination link; its href carries the page count.
            $albumNumSelector = "#pages > div > a:last-child";
            echo "111111111";
            $content = null;
            $baseQl = null;
            try {
                $content = $this->getContent($peopleUrl);
                // getContent() hands back curl_exec()'s result, which is false on failure.
                if (is_string($content)) {
                    $baseQl = QueryList::getInstance()->setHtml($content);
                }
            } catch (Exception $e) {
                dump($e->getMessage());
            }
            echo "222222222";
            if ($baseQl === null) {
                // Without a parsed listing page nothing can be scraped for this person.
                dump("failed to load album list, skipped: " . $peopleUrl);
                continue;
            }

            // Number of listing pages. When there is no pagination link the
            // listing fits on a single page.
            $lastPageLinks = $baseQl->find($albumNumSelector)->attrs("href")->all();
            $totalAlbumPage = 1;
            if (count($lastPageLinks) > 0) {
                $pageMatched = preg_match("#page=(\d+)#", (string) $lastPageLinks[0], $result);
                dump($result);
                if ($pageMatched === 1) {
                    $totalAlbumPage = (int) $result[1];
                }
            }

            $baseAlbumUrl = "https://www.tujidao01.com/t/?id={$name['code']}&page=";
            // Maximum number of albums processed for this person.
            $countLimit = $isAll ? 5000 : 8;
            echo "totalAlbumPage is $totalAlbumPage\n";

            for ($page = 1; $page <= $totalAlbumPage; $page++) {
                // Page 1 is the listing page that was already fetched above.
                if ($page > 1) {
                    $content = $this->getContent($baseAlbumUrl . $page);
                    if (!is_string($content)) {
                        dump("failed to load album page, skipped: " . $baseAlbumUrl . $page);
                        continue;
                    }
                }
                $albumQl = QueryList::getInstance()->setHtml($content);

                $rules = [
                    'num' => ['span.shuliang', 'text'],
                    'title' => ['p.biaoti', 'text'],
                    'img' => ['a>img', 'src'],
                    "code" => ['p.biaoti a', "href"],
                ];
                $range = "div.hezi>ul li";
                $albumList = $albumQl->rules($rules)->range($range)->query()->getData()->all();

                foreach ($albumList as $album) {
                    if ($countLimit <= 0) {
                        dump("相册已超过限制数量,跳出");
                        break 2;
                    }
                    dump("current album page no: " . $page);
                    usleep(1000 * random_int(100, 1000));
                    dump("相册:", [$album]);

                    // "/" would be read as a path separator inside the folder name.
                    $titleStr = preg_replace("#/#", "-", (string) ($album["title"] ?? ""));

                    // Image count comes from the "<N>P" counter; no counter means no images.
                    $totalImageNum = 0;
                    if (preg_match("#(\d+)P#", (string) ($album["num"] ?? ""), $result) === 1) {
                        $totalImageNum = (int) $result[1];
                    }

                    // The album id is the id=<digits> query parameter of the album link.
                    $albumHref = (string) ($album["code"] ?? "");
                    if (preg_match("#id\=(\d+)#", $albumHref, $albumCodeResult) !== 1) {
                        dump("album id not found, skipped: " . $albumHref);
                        continue;
                    }
                    $albumCode = $albumCodeResult[1];
                    $baseImageUrl = "https://tjg.gzhuibei.com/a/1/{$albumCode}/";

                    // Reuse the folder already on disk for this album id, if any;
                    // otherwise the folder is "<id>-<title>".
                    if (array_key_exists($albumCode, $albumCodeMap)) {
                        $albumPath = $baseDir . $albumCodeMap[$albumCode];
                    } else {
                        $albumPath = $baseDir . $albumCode . "-" . $titleStr;
                    }

                    for ($j = 1; $j <= $totalImageNum; $j++) {
                        $imageUrl = $baseImageUrl . $j . ".jpg";
                        $imageName = $username . "-" . $albumCode . "-" . $j . ".jpg";
                        $imagePath = $albumPath . "/" . $imageName;

                        if (!file_exists($albumPath)) {
                            dump($albumPath);
                            // Recursive, so a missing per-person base directory is created too.
                            mkdir($albumPath, 0777, true);
                        }

                        if (file_exists($imagePath)) {
                            dump($imagePath . " exists. skipped!");
                            continue;
                        }

                        $curl_handle = curl_init();
                        curl_setopt_array($curl_handle, [
                            CURLOPT_URL => $imageUrl,
                            CURLOPT_RETURNTRANSFER => true,
                            CURLOPT_ENCODING => '',
                            CURLOPT_MAXREDIRS => 10,
                            // 0 disables the overall transfer timeout.
                            CURLOPT_TIMEOUT => 0,
                            CURLOPT_FOLLOWLOCATION => true,
                            CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
                            CURLOPT_CUSTOMREQUEST => 'GET',
                            CURLOPT_HTTPHEADER => $imageHeaders,
                        ]);

                        $query = curl_exec($curl_handle);
                        // Dedicated retry counter. PHP variables are function-scoped,
                        // so reusing the page loop variable here would reset the page
                        // index after every download.
                        $retry = 1;
                        while ($query === false) {
                            echo 'Curl error: ' . curl_error($curl_handle) . "\n";
                            echo "retry times: " . $retry++ . " times \n";
                            sleep(1);
                            // usleep() takes microseconds: this is a 100-1000 ms pause,
                            // although the log message below calls the unit "nano second".
                            $sleepTime = 1000 * random_int(100, 1000);
                            echo "retry sleep {$sleepTime} nano second \n";
                            usleep($sleepTime);
                            $query = curl_exec($curl_handle);
                            if ($retry >= 100) {
                                break;
                            }
                        }
                        echo curl_error($curl_handle);
                        curl_close($curl_handle);

                        if ($query === false) {
                            // Writing nothing keeps the image eligible for the next run;
                            // an empty file would make it look downloaded.
                            dump("download failed, skipped: " . $imageUrl);
                            continue;
                        }

                        // 'x' creates the file and fails if it already exists.
                        $fp = fopen($imagePath, 'x');
                        if ($fp === false) {
                            dump("cannot create file, skipped: " . $imagePath);
                            continue;
                        }
                        fwrite($fp, $query);
                        fclose($fp);

                        $sleepTime = 1000 * random_int(100, 1000);
                        echo "after write image sleep {$sleepTime} nano second \n";
                        usleep($sleepTime);
                        echo $imageUrl;
                    }

                    $countLimit--;
                }
            }
        }
    }

    /**
     * Fetch a URL with an HTTP GET request and return the response body.
     *
     * The request follows up to 10 redirects, times out after 10 seconds and
     * sends a fixed set of browser-like headers.
     *
     * NOTE(review): the header list embeds a hard-coded session cookie; it looks
     * like a personal login session - consider moving it to configuration.
     *
     * @param string $url Address to fetch.
     *
     * @return string|bool The result of curl_exec(): the body on success, false on failure.
     */
    public function getContent($url)
    {
        $requestHeaders = [
            'authority: www.tujidao01.com',
            'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-language: zh-CN,zh;q=0.9',
            'cache-control: max-age=0',
            'cookie: PHPSESSID=ndm118vli42e1db7dfhqmvgjo7; __51vcke__Je64MI06Q1Neac4F=3d9a0d91-cf15-5bf7-ab90-90734f856aba; __51vuft__Je64MI06Q1Neac4F=1654567556100; uid=315696; name=nicksxs; leixing=0; __51uvsct__Je64MI06Q1Neac4F=2; __vtins__Je64MI06Q1Neac4F=%7B%22sid%22%3A%20%22d5d48e8b-a16e-5451-95f4-e629e6a4ec1b%22%2C%20%22vd%22%3A%205%2C%20%22stt%22%3A%20287951%2C%20%22dr%22%3A%205812%2C%20%22expires%22%3A%201654862307975%2C%20%22ct%22%3A%201654860507975%7D',
            'referer: https://www.tujidao01.com/sousu/?s0=%E6%9D%A8%E6%99%A8%E6%99%A8',
            'sec-ch-ua: " Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
            'sec-ch-ua-mobile: ?0',
            'sec-ch-ua-platform: "macOS"',
            'sec-fetch-dest: document',
            'sec-fetch-mode: navigate',
            'sec-fetch-site: same-origin',
            'sec-fetch-user: ?1',
            'upgrade-insecure-requests: 1',
            'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.61 Safari/537.36',
        ];

        $curl = curl_init();
        curl_setopt_array($curl, [
            CURLOPT_URL => "$url",
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_ENCODING => '',
            CURLOPT_MAXREDIRS => 10,
            CURLOPT_TIMEOUT => 10,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
            CURLOPT_CUSTOMREQUEST => 'GET',
            CURLOPT_HTTPHEADER => $requestHeaders,
        ]);

        $response = curl_exec($curl);
        curl_close($curl);

        return $response;
    }
}