queryInstance = QueryList::getInstance();
    }

    /**
     * Returns the QueryList instance stored on this object.
     *
     * @return QueryList|null
     */
    public function getQueryInstance(): ?QueryList
    {
        return $this->queryInstance;
    }

    /**
     * People to scrape, keyed by a short alias.
     *
     * The alias is also used as the file-name prefix of downloaded images.
     * Each entry holds:
     *  - "dir":  sub-directory name used under the local image root,
     *  - "code": identifier appended to the site's /girl/ URL,
     *  - "name": display name (not read by the code in this section).
     *
     * @var array
     */
    private static $name_dir = [
        "周韦彤" => [
            "dir" => "周韦彤",
            "code" => 16274,
            "name" => "周韦彤"
        ],
        "ry" => [
            "dir" => "忍野さら",
            "code" => "21250",
            "name" => "忍野さら"
        ],
        "ycc" => [
            "dir" => "ycc",
            "code" => 22162,
            "name" => "杨晨晨"
        ],
        "azu" => [
            "dir" => "azu",
            "code" => 26002,
            "name" => "阿朱"
        ],
        "xq" => [
            "dir" => "xq",
            "code" => 22204,
            "name" => "小琪"
        ],
        "ygh" => [
            "dir" => "ygh",
            "code" => 15902,
            "name" => "原干惠"
        ],
        "wyc" => [
            "dir" => "wyc",
            "code" => 19702,
            "name" => "王语纯"
        ],
        "zz" => [
            "dir" => "zz",
            "code" => 22899,
            "name" => "芝芝 booty"
        ],
        "hlr" => [
            "dir" => "hlr",
            "code" => 20015,
            "name" => "黄乐然"
        ],
        "jrq" => [
            "dir" => "jrq",
            "code" => 26560,
            "name" => "姜仁卿"
        ],
        "ny" => [
            "dir" => "ny",
            "code" => 26298,
            "name" => "奈月"
        ],
        "杉本有美" => [
            "dir" => "杉本有美",
            "code" => 15939,
            "name" => "杉本有美"
        ]
    ];

    /**
     * Walks every entry of self::$name_dir, lists that person's albums and
     * downloads each image that checkIfNeedDownload() reports as missing.
     *
     * @throws Exception when an album page cannot be fetched (see getAlbumTitle())
     */
    public function scrapeXiuGirls()
    {
        $NUM_OF_ATTEMPTS = 50;
        $baseUrl = "https://www.xsnvshen.com";

        foreach (self::$name_dir as $username => $name) {
            $username = (string) $username;
            $baseDir = "/Volumes/intel660p/image/xg/" . $name["dir"] . "/";

            // Without the base directory nothing can be stored; skip this person
            // instead of carrying on with a path that does not exist.
            if (!$this->ensureDirectory($baseDir)) {
                echo "cannot create directory: " . $baseDir . "\n";
                continue;
            }

            $items = $this->queryAllAlbum($name);

            // Scan the album directories that already exist under $baseDir.
            // The result is used below as a map of album number => directory path.
            $baseMap = FileUtils::scanBaseDir($baseDir);

            foreach ($items as $item) {
                // Album links are expected to look like "/album/<code>".
                $albumCode = explode("/", (string) $item)[2] ?? null;
                if ($albumCode === null || $albumCode === "") {
                    echo "skip malformed album link: " . $item . "\n";
                    continue;
                }

                CommonUtils::randomSleep(100);
                echo "相册子链接: " . $item . "\n";

                $title = $this->getAlbumTitle($baseUrl, $item);
                $albumPath = $this->generateAlbumPath("", $albumCode, $baseMap, $baseDir, $item, $title[0] ?? "");
                $images = $this->queryImages($baseUrl, $item, 0, $NUM_OF_ATTEMPTS);

                foreach ($images as $image) {
                    $image = $this->stripThumbnailSegment($image);

                    // TODO: switch to CommonService::downloadImage($albumPath, "https:" . $image)
                    if (!$this->checkIfNeedDownload($image, $username, $albumCode, $name, $albumPath)) {
                        continue;
                    }

                    $this->processDownloadImage($image, $baseUrl, $item, $albumPath, $username, $albumCode);
                }

                // Pause 0.1 - 1 s between albums.
                usleep(1000 * random_int(100, 1000));
            }
        }
    }

    /**
     * Downloads one album of one person.
     *
     * The album page "/album/{$albumOuterCode}" is fetched directly and every
     * image whose file is not yet present in the album directory is downloaded.
     * Unlike scrapeXiuGirls() this method only looks at the file system and
     * does not read or write XiuGirl records.
     *
     * @param string     $iUser          key into self::$name_dir
     * @param int|string $albumOuterCode album number as used in the site's /album/ URL
     * @throws Exception when the album page cannot be fetched (see getAlbumTitle())
     */
    public function singleAlbum($iUser = "ycc", $albumOuterCode = 21429)
    {
        $NUM_OF_ATTEMPTS = 50;

        if (!array_key_exists($iUser, self::$name_dir)) {
            echo "unknown user key: " . $iUser . "\n";
        } else {
            $username = (string) $iUser;
            $name = self::$name_dir[$iUser];
            $albumCode = (string) $albumOuterCode;
            $item = "/album/" . $albumCode;
            $baseUrl = "https://www.xsnvshen.com";
            $baseDir = "/Volumes/intel660p/image/xg/" . $name["dir"] . "/";

            if (!$this->ensureDirectory($baseDir)) {
                echo "cannot create directory: " . $baseDir . "\n";
            } else {
                // Pause 1 - 10 s before the first request.
                usleep(random_int(1000, 10000) * 1000);
                echo "相册子链接: " . $item . "\n";

                $title = $this->getAlbumTitle($baseUrl, $item);
                // An empty map makes generateAlbumPath() use "<baseDir><code>-<title>".
                $albumPath = $this->generateAlbumPath("", $albumCode, [], $baseDir, $item, $title[0] ?? "");
                $images = $this->queryImages($baseUrl, $item, 0, $NUM_OF_ATTEMPTS);

                foreach ($images as $image) {
                    $image = $this->stripThumbnailSegment($image);

                    if ($this->imageFileExists($albumPath, $image, $username, $albumCode)) {
                        continue;
                    }

                    $this->processDownloadImage($image, $baseUrl, $item, $albumPath, $username, $albumCode);
                }

                usleep(1000 * random_int(100, 1000));
            }
        }

        // NOTE(review): this method has always terminated the process instead of
        // returning; kept so existing callers see the same behaviour — confirm
        // whether a plain return would be preferable.
        exit;
    }

    /**
     * Lists the album links found on a person's page.
     *
     * Per the original author's note, the site's per-person page shows every
     * album, so no pagination is needed.
     *
     * @param array $name  one entry of self::$name_dir; only "code" is read
     * @param bool  $isAll whether to return every album; defaults to false,
     *                     which keeps only the first 5 links
     * @return array album hrefs as found in the page
     */
    private function queryAllAlbum($name, $isAll = false): array
    {
        CommonUtils::randomSleep(100);

        $peopleUrl = "https://xsnvshen.com/girl/" . $name["code"];
        $albumSelector = ".entryAblum > .star-mod-bd > ul > li > a";

        $items = $this->getQueryInstance()
            ->get($peopleUrl)
            ->find($albumSelector)
            ->attrs("href")
            ->all();

        return $isAll ? $items : array_slice($items, 0, 5);
    }

    /**
     * Fetches an album page and returns the texts of its "h1 > a" elements.
     *
     * While the first title reads "古诗文" the page is fetched again after a
     * random pause (presumably the site serves that page in place of the
     * album — TODO confirm). That loop is intentionally unbounded, as before.
     *
     * @param string $baseUrl site root, e.g. "https://www.xsnvshen.com"
     * @param string $item    album link relative to $baseUrl
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection
     * @throws Exception when the page cannot be fetched after all retries
     */
    private function getAlbumTitle(string $baseUrl, $item)
    {
        $url = $baseUrl . $item;

        $html = $this->fetchPageWithRetry($url);
        $title = $html->find("h1 > a")->texts();
        echo "相册名: " . ($title[0] ?? "") . "\n";

        while (trim((string) ($title[0] ?? "")) === "古诗文") {
            echo "here error happenned \n";
            // Pause 0.05 - 50 s before asking again.
            usleep(random_int(1, 1000) * 50000);
            $html = $this->fetchPageWithRetry($url);
            $title = $html->find("h1 > a")->texts();
        }

        return $title;
    }

    /**
     * Requests a page through the shared QueryList instance, retrying on failure.
     *
     * @param string $url         absolute URL to request
     * @param int    $maxAttempts number of requests made before giving up
     * @return mixed whatever QueryList::get() returns for the page
     * @throws Exception when every attempt failed; the last failure is chained
     */
    private function fetchPageWithRetry(string $url, int $maxAttempts = 9)
    {
        $lastError = null;

        for ($attempt = 1; $attempt <= $maxAttempts; $attempt++) {
            try {
                return $this->getQueryInstance()->get($url);
            } catch (Exception $e) {
                $lastError = $e;
                Log::error("查询相册子链接失败,将重试, 异常信息: " . $e->getMessage());
                // Pause 1 - 10 s before the next attempt.
                $sleepTime = 1000 * random_int(1000, 10000);
                echo "查询相册子链接失败 sleep {$sleepTime} microseconds \n";
                usleep($sleepTime);
            }
        }

        throw new Exception("giving up on " . $url . " after " . $maxAttempts . " attempts", 0, $lastError);
    }

    /**
     * Resolves the directory an album is stored in, creating it when needed.
     *
     * Order of precedence: the entry for $albumCode in $baseMap, then a
     * non-empty $albumPath passed in, then "<baseDir><code>-<title>". In the
     * last case the directory is created if it does not exist yet; if it
     * already exists its path is returned as is.
     *
     * @param string $albumPath path already known to the caller, or ""
     * @param string $albumCode album number
     * @param array  $baseMap   album number => existing directory path
     * @param string $baseDir   person directory, with trailing slash
     * @param string $item      album link of the form "/album/<code>"
     * @param string $title     album title used in the directory name
     * @return string
     */
    private function generateAlbumPath(string $albumPath, string $albumCode, array $baseMap, string $baseDir, $item, $title): string
    {
        if (array_key_exists($albumCode, $baseMap)) {
            $albumPath = (string) $baseMap[$albumCode];
        }

        if ($albumPath !== "") {
            return $albumPath;
        }

        $albumPath = $baseDir . explode("/", $item)[2] . "-" . $title;
        if (!file_exists($albumPath)) {
            mkdir($albumPath);
        }

        return $albumPath;
    }

    /**
     * Collects the "src" attributes of all ".swi-hd > img" elements on an album page.
     *
     * The request is retried after a random pause whenever it throws. When
     * every attempt fails an empty list is returned.
     *
     * @param string $baseUrl         site root
     * @param string $item            album link relative to $baseUrl
     * @param int    $attempts        number of attempts already used
     * @param int    $NUM_OF_ATTEMPTS highest value $attempts may reach while still retrying
     * @return array list of non-empty image source strings
     */
    private function queryImages(string $baseUrl, $item, int $attempts, int $NUM_OF_ATTEMPTS): array
    {
        do {
            try {
                $sources = $this->getQueryInstance()
                    ->get($baseUrl . $item)
                    ->find(".swi-hd > img")
                    ->attrs("src")
                    ->all();

                // Elements without a usable src cannot be downloaded.
                return array_values(array_filter($sources, static function ($src) {
                    return is_string($src) && $src !== "";
                }));
            } catch (Exception $e) {
                echo $e->getTraceAsString() . "\n";
                // Pause 1 - 10 s before the next attempt.
                $sleepTime = 10000 * random_int(100, 1000);
                echo "sleep {$sleepTime} microseconds \n";
                usleep($sleepTime);
                $attempts++;
            }
        } while ($attempts <= $NUM_OF_ATTEMPTS);

        return [];
    }

    /**
     * Decides whether an image still has to be downloaded.
     *
     * An image needs downloading exactly when its file is not on disk. When
     * the file exists but no matching XiuGirl record does, a record is
     * created so the database catches up with the file system.
     *
     * @param string $image     image URL without scheme, as found in the page
     * @param string $username  alias used as file-name prefix
     * @param string $albumCode album number
     * @param array  $name      entry of self::$name_dir; only "code" is read
     * @param string $albumPath directory of the album
     * @return bool true when the image must be downloaded
     */
    private function checkIfNeedDownload(string $image, string $username, string $albumCode, $name, string $albumPath): bool
    {
        if (!$this->imageFileExists($albumPath, $image, $username, $albumCode)) {
            return true;
        }

        $imageFileName = $username . "-" . $albumCode . "-" . pathinfo("http:" . $image)["filename"] . ".jpg";

        $recorded = XiuGirl::where([
            "user_id" => $name["code"],
            "album_id" => $albumCode,
            "image_name" => $imageFileName
        ])->get();

        if ($recorded->count() === 0) {
            $xiuGirl = new XiuGirl;
            $xiuGirl->user_id = $name["code"];
            $xiuGirl->album_id = $albumCode;
            $xiuGirl->image_name = $imageFileName;
            $xiuGirl->site_from = "xiugirl";
            $xiuGirl->is_downloaded = 1;
            $xiuGirl->save();
        }

        return false;
    }

    /**
     * Tells whether the image is already stored in the album directory,
     * either as "<filename>.jpg" or as "<username>-<albumCode>-<filename>.jpg".
     *
     * @param string $albumPath directory of the album
     * @param string $image     image URL without scheme
     * @param string $username  alias used as file-name prefix
     * @param string $albumCode album number
     * @return bool
     */
    private function imageFileExists(string $albumPath, string $image, string $username, string $albumCode): bool
    {
        $fileName = pathinfo("http:" . $image)["filename"] . ".jpg";
        $directory = $albumPath . "/";

        return file_exists($directory . $fileName)
            || file_exists($directory . $username . "-" . $albumCode . "-" . $fileName);
    }

    /**
     * Downloads one image with cURL and stores it as
     * "<albumPath>/<username>-<albumCode>-<filename>.jpg".
     *
     * Transport failures are retried (up to 99 further requests). Nothing is
     * written when the transfer never succeeds, when the server answers with
     * an HTTP error status, or when the target file cannot be created; an
     * existing file is never overwritten (fopen mode "x").
     *
     * @param string $image     image URL without scheme
     * @param string $baseUrl   site root, used to build the Referer header
     * @param string $item      album link relative to $baseUrl
     * @param string $albumPath directory of the album
     * @param string $username  alias used as file-name prefix
     * @param string $albumCode album number
     * @throws Exception when random_int() cannot gather randomness
     */
    private function processDownloadImage(string $image, string $baseUrl, $item, string $albumPath, string $username, string $albumCode): void
    {
        echo "image url is " . $image . "\n";

        $target = $albumPath . "/" . $username . "-" . $albumCode . "-" . pathinfo("http:" . $image)["filename"] . ".jpg";

        $curl_handle = curl_init();
        curl_setopt_array($curl_handle, array(
            CURLOPT_URL => "https:" . $image,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_ENCODING => '',
            CURLOPT_MAXREDIRS => 10,
            // 0 means no overall time limit for the transfer.
            CURLOPT_TIMEOUT => 0,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
            CURLOPT_CUSTOMREQUEST => 'GET',
            CURLOPT_HTTPHEADER => array(
                'authority: img.xsnvshen.com',
                'pragma: no-cache',
                'cache-control: no-cache',
                'sec-ch-ua: " Not A;Brand";v="99", "Chromium";v="90", "Google Chrome";v="90"',
                'sec-ch-ua-mobile: ?0',
                'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
                'accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
                'sec-fetch-site: same-site',
                'sec-fetch-mode: no-cors',
                'sec-fetch-dest: image',
                // $item already starts with "/album/", so it is appended to the site root only.
                'referer: ' . $baseUrl . $item,
                'accept-language: zh-CN,zh;q=0.9',
            ),
        ));

        try {
            $content = curl_exec($curl_handle);
            $i = 1;
            while ($content === false) {
                echo "Curl error: " . curl_error($curl_handle) . "\n";
                echo "retry times: " . $i++ . " times \n";
                sleep(1);
                // Additional pause of 1 - 10 s before retrying.
                $sleepTime = 1000 * random_int(1000, 10000);
                echo "retry sleep {$sleepTime} microseconds \n";
                usleep($sleepTime);
                $content = curl_exec($curl_handle);
                if ($i >= 100) {
                    break;
                }
            }
            $status = (int) curl_getinfo($curl_handle, CURLINFO_HTTP_CODE);
        } finally {
            // Frees the handle on PHP 7; has no effect on PHP 8.
            curl_close($curl_handle);
        }

        if ($content === false) {
            echo "giving up on image " . $image . "\n";
            return;
        }

        if ($status >= 400) {
            // The body is an error page, not an image; do not store it as .jpg.
            echo "HTTP " . $status . " for image " . $image . ", not saved \n";
            return;
        }

        $fp = fopen($target, "x");
        if ($fp === false) {
            echo "cannot create file " . $target . "\n";
            return;
        }
        fwrite($fp, $content);
        fclose($fp);

        // Pause 0.1 - 1 s after each stored image.
        $sleepTime = 1000 * random_int(100, 1000);
        echo "after write image sleep {$sleepTime} microseconds \n";
        usleep($sleepTime);
    }

    /**
     * Removes the "thumb_600x900/" path segment from an image URL, if present.
     *
     * @param string $image image URL as found in the page
     * @return string
     */
    private function stripThumbnailSegment(string $image): string
    {
        return str_replace("thumb_600x900/", "", $image);
    }

    /**
     * Makes sure a directory exists, creating its last path segment if needed.
     *
     * Parents are deliberately not created: the image root lives under
     * /Volumes, and a missing parent should surface as a failure here rather
     * than be created silently.
     *
     * @param string $path directory path
     * @return bool true when the path exists afterwards
     */
    private function ensureDirectory(string $path): bool
    {
        if (file_exists($path)) {
            return true;
        }

        return mkdir($path) || file_exists($path);
    }
}