[ "dir" => "周韦彤", "code" => 16274, "name" => "周韦彤" ],
        "ycc" => [ "dir" => "ycc", "code" => 22162, "name" => "杨晨晨" ],
        "ry" => [ "dir" => "忍野さら", "code" => "21250", "name" => "忍野さら" ],
        "azu" => [ "dir" => "azu", "code" => 26002, "name" => "阿朱" ],
        "xq" => [ "dir" => "xq", "code" => 22204, "name" => "小琪" ],
        "ygh" => [ "dir" => "ygh", "code" => 15902, "name" => "原干惠" ],
        "wyc" => [ "dir" => "wyc", "code" => 19702, "name" => "王语纯" ],
        "zz" => [ "dir" => "zz", "code" => 22899, "name" => "芝芝 booty" ],
        "hlr" => [ "dir" => "hlr", "code" => 20015, "name" => "黄乐然" ],
        "jrq" => [ "dir" => "jrq", "code" => 26560, "name" => "姜仁卿" ],
        "ny" => [ "dir" => "ny", "code" => 26298, "name" => "奈月" ],
        "杉本有美" => [ "dir" => "杉本有美", "code" => 15939, "name" => "杉本有美" ]
    ];

    /**
     * Download every album image for each entry in self::$name_dir.
     *
     * For each entry the profile page is fetched to work out how many album
     * listing pages to walk, every album linked from those pages is opened to
     * read its title and image count, and each image is fetched with cURL and
     * written to <baseDir>/<albumCode>-<title>/<NNN>.jpg. Images already
     * present on disk are skipped. A download that still fails after the
     * retries are exhausted is reported and skipped; nothing is written for
     * it, so a later run will attempt it again.
     *
     * Progress is reported through dump()/echo, and randomised pauses are
     * inserted between requests.
     *
     * @return void
     */
    public function scrapeNvshenGirls()
    {
        $baseUrl = "https://www.nvshens.net";
        $userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36';
        // Number of additional curl_exec() attempts made after the first one fails.
        $maxRetries = 99;
        // Divisor used to turn the album count into a listing-page count.
        $albumsPerPage = 30;

        foreach (self::$name_dir as $name) {
            $baseDir = "/Users/shixuesen/Documents/tmp/image/xg/" . $name['dir'] . "/";

            usleep(random_int(1000, 10000) * 1000);

            $peopleUrl = $baseUrl . "/girl/" . $name['code'];

            // The total album count is read from the ".archive_more > a" link.
            $albumCountHtml = QueryList::get($peopleUrl)->find(".archive_more > a")->htmls()->all();

            // When that link is absent, the albums are taken from the profile page itself.
            $onlyOnePage = count($albumCountHtml) === 0;
            if ($onlyOnePage) {
                $totalAlbumPage = 1;
            } else {
                $totalAlbumNum = 0;
                $matched = preg_match("#\d+#", (string) ($albumCountHtml[0] ?? ''), $result);
                dump($result);
                if ($matched === 1 && is_numeric($result[0])) {
                    $totalAlbumNum = (int) $result[0];
                }
                $totalAlbumPage = (int) ceil($totalAlbumNum / $albumsPerPage);
            }

            $baseAlbumUrl = "https://www.nvshens.net/girl/{$name['code']}/album/";

            // The page counter has its own name so that nothing inside the
            // download loop can reset the pagination.
            for ($pageNo = 1; $pageNo <= $totalAlbumPage; $pageNo++) {
                $listUrl = $onlyOnePage ? $peopleUrl : $baseAlbumUrl . $pageNo . ".html";
                $pageAlbum = QueryList::get($listUrl)
                    ->find(".igalleryli > .igalleryli_div > .igalleryli_link")
                    ->attrs("href")
                    ->all();

                foreach ($pageAlbum as $album) {
                    dump("相册:", [$album]);

                    $albumUrl = $baseUrl . $album;
                    $pageQL = QueryList::get($albumUrl);
                    $page = $pageQL->find(".albumInfo > span")->htmls();
                    // NOTE(review): the id is spelled "htilte" in the selector;
                    // presumably it mirrors the site's markup - verify before "fixing".
                    $title = $pageQL->find(".albumTitle > #htilte")->htmls();
                    dump($title->all());

                    // A "/" inside the title would be read as a path separator.
                    $titleStr = str_replace("/", "_", (string) ($title->all()[0] ?? ''));

                    // The image count is the first run of digits in the album info.
                    $totalImageNum = 0;
                    if (preg_match("#\d+#", (string) ($page->all()[0] ?? ''), $result) === 1) {
                        $totalImageNum = (int) $result[0];
                    }
                    if ($totalImageNum < 1) {
                        dump($page->all());
                        continue;
                    }

                    // NOTE(review): assumes the href has the shape "/g/NNNNN/" with a
                    // five-character album code - confirm against the live markup.
                    $albumCode = substr($album, 3, 5);
                    $baseImageUrl = "https://t1.onvshen.com:85/gallery/{$name['code']}/{$albumCode}/";

                    $albumPath = $baseDir . $albumCode . "-" . $titleStr;
                    if (!is_dir($albumPath)) {
                        dump($albumPath);
                        // Recursive, so a missing base directory is created as well.
                        if (!mkdir($albumPath, 0777, true) && !is_dir($albumPath)) {
                            echo "failed to create directory {$albumPath} \n";
                            continue;
                        }
                    }

                    // One handle per album: only the URL changes between images.
                    $curlHandle = curl_init();
                    // NOTE(review): CURLOPT_CONNECTTIMEOUT is expressed in seconds, so
                    // 2000 is roughly 33 minutes; left unchanged - confirm whether
                    // CURLOPT_CONNECTTIMEOUT_MS was intended.
                    curl_setopt($curlHandle, CURLOPT_CONNECTTIMEOUT, 2000);
                    curl_setopt($curlHandle, CURLOPT_RETURNTRANSFER, 1);
                    curl_setopt($curlHandle, CURLOPT_USERAGENT, $userAgent);
                    curl_setopt($curlHandle, CURLOPT_REFERER, $albumUrl);

                    for ($j = 0; $j < $totalImageNum; $j++) {
                        // Files are stored zero-padded (000.jpg, 001.jpg, ...); the
                        // first image is requested as "0.jpg", the rest zero-padded.
                        $localName = str_pad((string) $j, 3, "0", STR_PAD_LEFT) . ".jpg";
                        $remoteName = $j === 0 ? "0.jpg" : $localName;

                        $imagePath = $albumPath . "/" . $localName;
                        if (file_exists($imagePath)) {
                            continue;
                        }

                        $imageUrl = $baseImageUrl . $remoteName;
                        curl_setopt($curlHandle, CURLOPT_URL, $imageUrl);
                        $query = curl_exec($curlHandle);

                        for ($attempt = 1; $query === false && $attempt <= $maxRetries; $attempt++) {
                            echo 'Curl error: ' . curl_error($curlHandle) . "\n";
                            echo "retry times: " . $attempt . " times \n";
                            sleep(1);
                            $sleepTime = 1000 * random_int(1000, 10000);
                            echo "retry sleep {$sleepTime} microseconds \n";
                            usleep($sleepTime);
                            $query = curl_exec($curlHandle);
                        }

                        // Never leave an empty placeholder behind: the file_exists()
                        // check above would treat it as an already downloaded image.
                        if ($query === false || $query === '') {
                            echo "giving up on {$imageUrl}: " . curl_error($curlHandle) . "\n";
                            continue;
                        }

                        // 'x' creates the file and fails if it already exists.
                        $fp = fopen($imagePath, 'x');
                        if ($fp === false) {
                            echo "cannot create {$imagePath} \n";
                            continue;
                        }
                        $written = fwrite($fp, $query);
                        fclose($fp);
                        if ($written !== strlen($query)) {
                            // Remove the truncated file so that it is fetched again later.
                            unlink($imagePath);
                            echo "incomplete write for {$imagePath} \n";
                            continue;
                        }

                        $sleepTime = 1000 * random_int(100, 1000);
                        echo "after write image sleep {$sleepTime} microseconds \n";
                        usleep($sleepTime);
                        echo $imageUrl;
                    }

                    dump($page->all());
                }

                usleep(1000 * random_int(100, 1000));
            }

            usleep(1000 * random_int(100, 1000));
        }

        usleep(1000 * random_int(100, 1000));
    }
}