[ "dir" => "周韦彤", "code" => 1456, "name" => "周韦彤" ], "ycc" => [ "dir" => "ycc", "code" => 459, "name" => "杨晨晨" ], "ry" => [ "dir" => "忍野さら", "code" => "1875", "name" => "忍野さら" ], "azu" => [ "dir" => "azu", "code" => 437, "name" => "阿朱" ], "xq" => [ "dir" => "xq", "code" => 2438, "name" => "小琪" ], "ygh" => [ "dir" => "ygh", "code" => 550, "name" => "原干惠" ], "wyc" => [ "dir" => "wyc", "code" => 293, "name" => "王语纯" ], "zz" => [ "dir" => "zz", "code" => 954, "name" => "芝芝 booty" ], "hlr" => [ "dir" => "hlr", "code" => 1289, "name" => "黄乐然" ], "jrq" => [ "dir" => "jrq", "code" => 5034, "name" => "姜仁卿" ], "ny" => [ "dir" => "ny", "code" => 5301, "name" => "奈月" ], "杉本有美" => [ "dir" => "杉本有美", "code" => 632, "name" => "杉本有美" ], "糯美子" => [ "dir" => "糯美子", "code" => 161, "name" => "糯美子" ] ];

    /**
     * Download the albums of every entry in self::$name_dir from tujigu.com.
     *
     * For each entry the method:
     *  1. indexes the album directories already present under the entry's base
     *     directory by their numeric album-id prefix,
     *  2. reads the entry's tag page to work out how many album-list pages exist
     *     (40 albums per page),
     *  3. walks every album link on every list page, and
     *  4. downloads each picture that is not on disk yet.
     *
     * Progress is reported through dump()/echo; nothing is returned.
     *
     * @return void
     */
    public function scrapeTujiguGirls()
    {
        $baseUrl = "https://www.tujigu.com/";
        $albumsPerPage = 40;

        foreach (self::$name_dir as $username => $name) {
            $baseDir = "/Volumes/intel660p/image/xg/" . $name["dir"] . "/";

            // Rebuilt for every entry: album ids may repeat across sites and across people.
            $albumCodeMap = $this->indexTujiguAlbumDirs($baseDir);

            usleep(random_int(1000, 10000) * 1000);
            $peopleUrl = "https://www.tujigu.com/t/" . $name['code'];

            // Total album count as shown on the tag page.
            $albumNumSelector = "body > div:nth-child(4) > span";
            $baseQl = QueryList::get($peopleUrl);
            $albumCountHtml = $baseQl->find($albumNumSelector)->htmls()->all();
            dump($albumCountHtml);

            if (count($albumCountHtml) == 0) {
                // The count is not rendered when everything fits on one page.
                $totalAlbumPage = 1;
            } else {
                $totalAlbumNum = 0;
                if (preg_match("#\d+#", (string) ($albumCountHtml[0] ?? ""), $result) === 1) {
                    $totalAlbumNum = (int) $result[0];
                }
                dump($result);
                // Always visit at least the first page, even if the count could not be parsed.
                $totalAlbumPage = max(1, (int) ceil($totalAlbumNum / $albumsPerPage));
            }

            $baseAlbumUrl = "https://www.tujigu.com/t/{$name['code']}/";
            for ($pageNo = 0; $pageNo < $totalAlbumPage; $pageNo++) {
                // The first list page is the tag page itself; later ones are index_<n>.html.
                $listUrl = $pageNo === 0
                    ? $peopleUrl
                    : $baseAlbumUrl . "index_" . $pageNo . ".html";
                $albumQl = QueryList::get($listUrl);
                $pageAlbum = $albumQl->find("body > div.hezi > ul > li > a")->attrs("href")->all();
                dump($pageAlbum);

                foreach ($pageAlbum as $album) {
                    dump("current album page no: " . $pageNo);
                    usleep(10000 * random_int(1000, 10000));
                    dump("相册:", [$album]);
                    $this->downloadTujiguAlbum($album, $username, $baseDir, $albumCodeMap, $baseUrl);
                }
            }
        }
    }

    /**
     * Map the numeric album-id prefix of each sub-directory of $baseDir to the directory name.
     *
     * Album titles differ between the source sites, so the album id (the part of the
     * directory name before the first "-") is what identifies an album already on disk.
     *
     * @param string $baseDir Directory path ending in "/".
     * @return array Album id => existing directory name; empty when $baseDir is missing or unreadable.
     */
    private function indexTujiguAlbumDirs($baseDir)
    {
        $albumCodeMap = [];
        if (!is_dir($baseDir)) {
            return $albumCodeMap;
        }

        $entries = scandir($baseDir);
        if ($entries === false) {
            return $albumCodeMap;
        }

        foreach ($entries as $albumDir) {
            if ($albumDir === "." || $albumDir === "..") {
                continue;
            }
            $albumCode = explode("-", $albumDir)[0];
            if (is_dir($baseDir . $albumDir) && is_numeric($albumCode)) {
                $albumCodeMap[$albumCode] = $albumDir;
            }
        }

        return $albumCodeMap;
    }

    /**
     * Download every picture of one album that is not on disk yet.
     *
     * @param string $album        Album page URL taken from the album list.
     * @param string $username     Key of the entry in self::$name_dir; used as file-name prefix.
     * @param string $baseDir      Per-entry base directory ending in "/".
     * @param array  $albumCodeMap Album id => existing directory name for this entry.
     * @param string $baseUrl      Site root, sent as the Referer of picture requests.
     * @return void
     */
    private function downloadTujiguAlbum($album, $username, $baseDir, array $albumCodeMap, $baseUrl)
    {
        // NOTE(review): assumes the album id is the 5th "/"-separated segment of the
        // href (e.g. https://host/a/<id>/) - confirm against the live markup.
        $urlParts = explode("/", (string) $album);
        $albumCode = $urlParts[4] ?? "";
        if ($albumCode === "") {
            dump("cannot extract album code, skipped", [$album]);
            return;
        }

        $pageQL = QueryList::get($album);
        $page = $pageQL->find("body > div.tuji > p:nth-child(5)")->htmls();
        $pageAlternative = $pageQL->find("body > div.tuji > p:nth-child(6)")->htmls();
        $title = $pageQL->find("body > div.tuji > div.weizhi > h1")->htmls();
        $titles = $title->all();
        dump("pageTitle all", [$titles, $album]);

        // "/" would be read as a path separator inside the directory name.
        $titleStr = str_replace("/", "-", (string) ($titles[0] ?? ""));

        // The picture count sits in the 5th or, on some albums, the 6th <p>.
        $totalImageNum = 0;
        foreach ([$page->all(), $pageAlternative->all()] as $candidates) {
            if (isset($candidates[0])
                && preg_match("#图片数量: (\d+)P#", (string) $candidates[0], $result) === 1
            ) {
                $totalImageNum = (int) $result[1];
                break;
            }
        }
        if ($totalImageNum === 0) {
            dump("cannot determine image count, skipped", [$album]);
            return;
        }

        // Reuse the directory already holding this album id, whatever title it carries.
        if (array_key_exists($albumCode, $albumCodeMap)) {
            $albumPath = $baseDir . $albumCodeMap[$albumCode];
        } else {
            $albumPath = $baseDir . $albumCode . "-" . $titleStr;
            if (!file_exists($albumPath)) {
                dump($albumPath);
                // Recursive, so a base directory that does not exist yet is created too.
                if (!mkdir($albumPath, 0777, true) && !is_dir($albumPath)) {
                    dump("cannot create album directory, skipped", [$albumPath]);
                    return;
                }
            }
        }

        $baseImageUrl = "https://lns.hywly.com/a/1/{$albumCode}/";
        for ($j = 1; $j <= $totalImageNum; $j++) {
            $imageUrl = $baseImageUrl . $j . ".jpg";
            $imageFile = $albumPath . "/" . $username . "-" . $albumCode . "-" . $j . ".jpg";

            if (file_exists($imageFile)) {
                dump($imageFile . " exists. skipped!");
                continue;
            }

            $imageData = $this->fetchTujiguImage($imageUrl, $baseUrl);
            if ($imageData === false) {
                // Writing nothing keeps the file absent, so a later run retries it.
                echo "download failed, skipped: " . $imageUrl . "\n";
                continue;
            }

            // 'x' refuses to overwrite a file that appeared since the check above.
            $fp = fopen($imageFile, 'x');
            if ($fp === false) {
                echo "cannot create file, skipped: " . $imageFile . "\n";
                continue;
            }
            fwrite($fp, $imageData);
            fclose($fp);

            $sleepTime = 1000 * random_int(100, 1000);
            echo "after write image sleep {$sleepTime} micro second \n";
            usleep($sleepTime);
            echo $imageUrl;
        }

        dump($page->all());
    }

    /**
     * Fetch one URL with cURL, retrying with a randomised pause while the transfer fails.
     *
     * @param string $imageUrl    URL to fetch.
     * @param string $referer     Value of the Referer request header.
     * @param int    $maxAttempts Total number of attempts, the first one included.
     * @return string|false Response body, or false when every attempt failed.
     */
    private function fetchTujiguImage($imageUrl, $referer, $maxAttempts = 100)
    {
        $handle = curl_init();
        curl_setopt($handle, CURLOPT_URL, $imageUrl);
        // NOTE(review): CURLOPT_CONNECTTIMEOUT is in seconds, so this is 2000 s;
        // CURLOPT_CONNECTTIMEOUT_MS may have been intended - confirm before changing.
        curl_setopt($handle, CURLOPT_CONNECTTIMEOUT, 2000);
        curl_setopt($handle, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($handle, CURLOPT_USERAGENT, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36');
        curl_setopt($handle, CURLOPT_REFERER, $referer);

        $body = curl_exec($handle);
        for ($retry = 1; $body === false && $retry < $maxAttempts; $retry++) {
            echo 'Curl error: ' . curl_error($handle) . "\n";
            echo "retry times: " . $retry . " times \n";
            sleep(1);
            $sleepTime = 1000 * random_int(1000, 10000);
            echo "retry sleep {$sleepTime} micro second \n";
            usleep($sleepTime);
            $body = curl_exec($handle);
        }
        echo curl_error($handle);

        return $body;
    }
}