[ "dir" => "周韦彤", "code" => 16274, "name" => "周韦彤" ], "ycc" => [ "dir" => "ycc", "code" => 22162, "name" => "杨晨晨" ], "ry" => [ "dir" => "忍野さら", "code" => "21250", "name" => "忍野さら" ], "azu" => [ "dir" => "azu", "code" => 26002, "name" => "阿朱" ], "xq" => [ "dir" => "xq", "code" => 22204, "name" => "小琪" ], "ygh" => [ "dir" => "ygh", "code" => 15902, "name" => "原干惠" ], "wyc" => [ "dir" => "wyc", "code" => 19702, "name" => "王语纯" ], "zz" => [ "dir" => "zz", "code" => 22899, "name" => "芝芝 booty" ], "hlr" => [ "dir" => "hlr", "code" => 20015, "name" => "黄乐然" ], "jrq" => [ "dir" => "jrq", "code" => 26560, "name" => "姜仁卿" ], "ny" => [ "dir" => "ny", "code" => 26298, "name" => "奈月" ], "杉本有美" => [ "dir" => "杉本有美", "code" => 15939, "name" => "杉本有美" ], "糯美子" => [ "dir" => "糯美子", "code" => 19411, "name" => "糯美子" ] ];

    /**
     * Scrape the albums of every entry in self::$name_dir from fnvshen.com and
     * store the images below /Volumes/intel660p/image/xg/<dir>/<albumId>-<title>/.
     *
     * Images that already exist on disk are skipped, so the method can be re-run
     * to pick up only what is missing.
     *
     * @param bool $isAll When truthy up to 5000 albums per person are visited,
     *                    otherwise only the first 5 albums listed.
     */
    public function scrapeNvshenGirls($isAll = false)
    {
        $mainQl = QueryList::getInstance();
        $baseUrl = "https://www.fnvshen.com";

        foreach (self::$name_dir as $username => $name) {
            $baseDir = "/Volumes/intel660p/image/xg/" . $name["dir"] . "/";
            Log::info("当前用户 " . $name["name"] . " from nvshen.org 站点");

            // Album titles differ between the two source sites, so the numeric
            // album id (directory name prefix) is used as the unique key.
            $albumCodeMap = $this->nvshenExistingAlbumDirs($baseDir);

            usleep(random_int(100, 1000) * 1000);
            $peopleUrl = $baseUrl . "/girl/" . $name['code'];

            $baseQl = $this->nvshenGetWithRetry($mainQl, $peopleUrl);
            if ($baseQl === null) {
                // Previously this fell through with an undefined variable.
                echo "giving up on {$peopleUrl} after repeated connection errors\n";
                continue;
            }

            // Total number of albums, shown in the "more" link of the profile page.
            $moreLinks = $baseQl->find(".archive_more > a")->htmls()->all();
            // When everything fits on one page the site does not show a count.
            $onlyOnePage = count($moreLinks) == 0;
            if ($onlyOnePage) {
                // The profile page itself is the only listing; fetch it once.
                $lastPageNo = 0;
            } else {
                $totalAlbumNum = 0;
                $result = [];
                if (preg_match("#\d+#", (string) reset($moreLinks), $result)) {
                    $totalAlbumNum = (int) $result[0];
                }
                dump($result);
                $lastPageNo = (int) ceil($totalAlbumNum / 30);
            }

            $baseAlbumUrl = $baseUrl . "/girl/{$name['code']}/album/";

            // Limit on the number of albums visited for this person.
            $countLimit = $isAll ? 5000 : 5;

            // NOTE: a dedicated loop variable is used here; the image download
            // retry counter used to share "$i" with this loop and clobbered it.
            for ($pageNo = 0; $pageNo <= $lastPageNo; $pageNo++) {
                dump("current album page no: " . $pageNo);

                $listUrl = $onlyOnePage ? $peopleUrl : $baseAlbumUrl . $pageNo . ".html";
                $albumQl = $this->nvshenGetWithRetry($mainQl, $listUrl);
                if ($albumQl === null) {
                    echo "giving up on {$listUrl} after repeated connection errors\n";
                    continue;
                }

                // Extract the links before the shared QueryList instance is reused.
                $pageAlbum = $albumQl
                    ->find(".igalleryli > .igalleryli_div > .igalleryli_link")
                    ->attrs("href")
                    ->all();

                foreach ($pageAlbum as $album) {
                    if ($countLimit <= 0) {
                        dump("相册已超过限制数量,跳出");
                        break 2;
                    }

                    usleep(1000 * random_int(100, 1000));
                    dump("相册:", [$album]);

                    $this->nvshenDownloadAlbum($mainQl, $baseUrl, $album, $username, $baseDir, $albumCodeMap);
                    $countLimit--;
                }

                usleep(1000 * random_int(100, 1000));
            }

            usleep(1000 * random_int(100, 1000));
        }

        usleep(1000 * random_int(100, 1000));
    }

    /**
     * Build a map of album id => existing directory name for one person.
     *
     * Only sub-directories whose name starts with a numeric id followed by "-"
     * (or consisting of the id alone) are included.
     *
     * @param string $baseDir Directory of the person, with trailing slash.
     *
     * @return array Album id => directory name; empty when $baseDir is missing.
     */
    private function nvshenExistingAlbumDirs($baseDir)
    {
        $albumCodeMap = [];
        if (!is_dir($baseDir)) {
            return $albumCodeMap;
        }

        $entries = scandir($baseDir);
        if ($entries === false) {
            return $albumCodeMap;
        }

        foreach ($entries as $albumDir) {
            if ($albumDir === "." || $albumDir === ".." || $albumDir === ".DS_Store") {
                continue;
            }

            $code = explode("-", $albumDir)[0];
            if (is_dir($baseDir . $albumDir) && is_numeric($code)) {
                $albumCodeMap[$code] = $albumDir;
            }
        }

        return $albumCodeMap;
    }

    /**
     * GET a page through QueryList, retrying on connection errors.
     *
     * @param mixed  $ql          QueryList instance used for the request.
     * @param string $url         Absolute URL to fetch.
     * @param int    $maxAttempts Number of attempts before giving up.
     *
     * @return mixed|null Whatever $ql->get() returns, or null when every attempt
     *                    ended in a ConnectException.
     */
    private function nvshenGetWithRetry($ql, $url, $maxAttempts = 100)
    {
        for ($attempt = 0; $attempt < $maxAttempts; $attempt++) {
            try {
                return $ql->get($url, [], ['maxTry' => 5]);
            } catch (ConnectException $e) {
                echo 'connection error: ' . $e->getMessage() . "\n";
                echo "retry times: " . $attempt . " times \n";
                sleep(1);
                // usleep() takes microseconds.
                $sleepTime = 1000 * random_int(100, 1000);
                echo "retry sleep {$sleepTime} micro second \n";
                usleep($sleepTime);
            }
        }

        return null;
    }

    /**
     * Download every missing image of one album.
     *
     * The album page is fetched once and queried for the image count, the title
     * and the first image URL (which provides the base URL of all images).
     *
     * @param mixed      $mainQl       QueryList instance used for the request.
     * @param string     $baseUrl      Site root, without trailing slash.
     * @param string     $album        Album href as found on the listing page, e.g. "/g/31347/".
     * @param string|int $username     Key of the person in self::$name_dir, used as file name prefix.
     * @param string     $baseDir      Directory of the person, with trailing slash.
     * @param array      $albumCodeMap Album id => existing directory name.
     */
    private function nvshenDownloadAlbum($mainQl, $baseUrl, $album, $username, $baseDir, array $albumCodeMap)
    {
        $albumUrl = $baseUrl . $album;

        $albumPageQl = $this->nvshenGetWithRetry($mainQl, $albumUrl);
        if ($albumPageQl === null) {
            echo "giving up on {$albumUrl} after repeated connection errors\n";
            return;
        }

        $page = $albumPageQl->find(".albumInfo > span")->htmls();
        $title = $albumPageQl->find(".albumTitle > #htilte")->htmls();
        $imageSource = $albumPageQl->find("#hgallery img:nth-child(1)")->attr("src");

        $titles = $title->all();
        $pages = $page->all();
        dump($titles);

        $result = [];
        if (
            count($titles) == 0
            || count($pages) == 0
            || !is_string($imageSource)
            || $imageSource === ''
            || !preg_match("#\d+#", (string) reset($pages), $result)
        ) {
            echo "unexpected album page layout, skipped: {$albumUrl}\n";
            return;
        }

        // A "/" in the title would otherwise be taken as a path separator.
        $titleStr = str_replace('/', '_', (string) reset($titles));
        $totalImageNum = (int) $result[0];
        $baseImageUrl = pathinfo($imageSource, PATHINFO_DIRNAME) . "/";

        // Album id taken from the href; falls back to the former fixed-width
        // slice when the href does not look like "/g/<digits>".
        $codeMatch = [];
        $albumCode = preg_match('#/g/(\d+)#', $album, $codeMatch)
            ? $codeMatch[1]
            : substr($album, 3, 5);

        if (array_key_exists($albumCode, $albumCodeMap)) {
            $albumPath = $baseDir . $albumCodeMap[$albumCode];
        } else {
            $albumPath = $baseDir . $albumCode . "-" . $titleStr;
        }

        if ($totalImageNum > 0 && !file_exists($albumPath)) {
            dump($albumPath);
            if (!mkdir($albumPath, 0777, true) && !is_dir($albumPath)) {
                echo "cannot create directory {$albumPath}, album skipped\n";
                return;
            }
        }

        for ($j = 0; $j < $totalImageNum; $j++) {
            $padded = str_pad((string) $j, 3, "0", STR_PAD_LEFT);
            // The first image is published as "0.jpg", all others zero-padded
            // to three digits; locally every file uses the padded form.
            $remoteName = ($j == 0 ? "0" : $padded) . ".jpg";
            $imageUrl = $baseImageUrl . $remoteName;
            $imagePath = $albumPath . "/" . $username . "-" . $albumCode . "-" . $padded . ".jpg";

            if (file_exists($imagePath)) {
                dump($imagePath . " exists. skipped!");
                continue;
            }

            $imageData = $this->nvshenDownloadImage($imageUrl, $albumUrl);
            if ($imageData === null) {
                // Do not create an empty file: it would be treated as
                // "already downloaded" on every later run.
                echo "download failed, skipped: {$imageUrl}\n";
                continue;
            }

            // 'x' refuses to overwrite a file that appeared in the meantime.
            $fp = fopen($imagePath, 'x');
            if ($fp === false) {
                echo "cannot create file {$imagePath}, skipped\n";
                continue;
            }
            fwrite($fp, $imageData);
            fclose($fp);

            $sleepTime = 1000 * random_int(100, 1000);
            echo "after write image sleep {$sleepTime} micro second \n";
            usleep($sleepTime);
            echo $imageUrl;
        }

        dump($pages);
    }

    /**
     * Download one image with curl, retrying while the transfer itself fails.
     *
     * @param string $imageUrl    Absolute URL of the image.
     * @param string $referer     Value sent as Referer header (the album page URL).
     * @param int    $maxAttempts Upper bound for the retry counter.
     *
     * @return string|null Response body, or null when the transfer kept failing,
     *                     the server answered with an HTTP error status, or the
     *                     body was empty.
     */
    private function nvshenDownloadImage($imageUrl, $referer, $maxAttempts = 100)
    {
        // The handle is released when it goes out of scope at return.
        $curlHandle = curl_init();
        curl_setopt($curlHandle, CURLOPT_URL, $imageUrl);
        // NOTE(review): CURLOPT_CONNECTTIMEOUT is in seconds; 2000 looks like it
        // was meant as milliseconds — confirm before changing.
        curl_setopt($curlHandle, CURLOPT_CONNECTTIMEOUT, 2000);
        curl_setopt($curlHandle, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($curlHandle, CURLOPT_USERAGENT, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36');
        curl_setopt($curlHandle, CURLOPT_REFERER, $referer);

        $query = curl_exec($curlHandle);
        $retry = 1;
        while ($query === false) {
            echo 'Curl error: ' . curl_error($curlHandle) . "\n";
            echo "retry times: " . $retry++ . " times \n";
            sleep(1);
            $sleepTime = 1000 * random_int(100, 1000);
            echo "retry sleep {$sleepTime} micro second \n";
            usleep($sleepTime);
            $query = curl_exec($curlHandle);
            if ($retry >= $maxAttempts) {
                break;
            }
        }
        echo curl_error($curlHandle);

        if ($query === false || $query === '') {
            return null;
        }

        // An error page must not be stored as if it were the image.
        $status = (int) curl_getinfo($curlHandle, CURLINFO_HTTP_CODE);
        if ($status >= 400) {
            echo "HTTP {$status} for {$imageUrl}\n";
            return null;
        }

        return $query;
    }

    /**
     * Manual smoke test: fetch one fixed album page and echo the contents of
     * its image-count and title elements.
     */
    public function subTest()
    {
        $baseUrl = "https://www.fnvshen.com";
        $album = "/g/31347/";

        $pageQL = QueryList::get($baseUrl . $album);

        $page = $pageQL->find(".albumInfo > span")->htmls();
        echo $page;

        $title = $pageQL->find(".albumTitle > #htilte")->htmls();
        echo $title;
    }
}