[ "path" => "XiuRen/", "dir" => "xiuren/" ],
        "xiaoyu" => [ "path" => "XiaoYu/", "dir" => "xiaoyu/" ],
        "youwu" => [ "path" => "YouWu/", "dir" => "youwu/" ],
        "mygirl" => [ "path" => "MyGirl/", "dir" => "mygirl/" ],
        "huayang" => [ "path" => "HuaYang/", "dir" => "huayang/" ],
        "mfstar" => [ "path" => "MFStar/", "dir" => "mfstar/" ],
        "imiss" => [ "path" => "IMiss/", "dir" => "imiss/" ]
    ];

    /** Base URL of the site; relative links are resolved against it. */
    public $domainUrl = "https://www.xiurenb.net/";

    /** URL of the XiuRen section (not referenced by the methods below). */
    public $xiurenRootUrl = "https://www.xiurenb.net/XiuRen/";

    // public $rootDir = "/Users/shixuesen/Documents/tmp/xiuren/";
    // public $rootDir = "/Volumes/Backup/images/xiuren/";
    /** Directory under which album folders are created. */
    public $rootDir = "/Volumes/Crucial X6/Image/xr/";

    /** Shared QueryList instance; holds the most recently fetched page. */
    public $queryInstance;

    /** Separate QueryList instance that receives the raw (unconverted) response. */
    public $queryNew;

    public function __construct()
    {
        $this->queryInstance = QueryList::getInstance();
        $this->queryNew = new QueryList();
    }

    /**
     * Scrape every section listed in $name_dir, limited to 20 items each.
     */
    public function scrapeAll()
    {
        foreach (array_keys($this->name_dir) as $site) {
            dump("current site: " . $site);
            $this->scrapeAlbum($site, 20);
        }
    }

    /**
     * Scrape the listing pages of one section.
     *
     * While the section is being processed, $rootDir temporarily points at the
     * section's sub-directory; it is restored afterwards even if scraping throws.
     *
     * @param string $path      Key into $name_dir (e.g. "xiuren").
     * @param int    $num       Upper bound on the item count used to compute the page range.
     * @param int    $startPage First listing page to fetch (0 is index.html).
     */
    public function scrapeAlbum($path, $num = 20, $startPage = 0)
    {
        $pageSize = 20;
        $urlPath = $this->name_dir[$path]["path"];
        $originalRootDir = $this->rootDir;
        $this->rootDir = $originalRootDir . $this->name_dir[$path]["dir"];

        try {
            $rawCount = $this->getEncodeHtmlContent($this->absoluteUrl($urlPath . "index.html"))
                ->find(".page span strong")
                ->htmls()
                ->get(0);
            dump("current site item count: " . $rawCount);

            $itemCount = (int) $rawCount;
            if ($itemCount <= 0) {
                return;
            }

            // NOTE(review): the bound is inclusive, so starting from page 0 this requests
            // ceil(n / pageSize) + 1 listing pages. Confirm against the site's pagination scheme.
            $lastPage = (int) ceil(min($itemCount, $num) / $pageSize);
            for ($page = (int) $startPage; $page <= $lastPage; $page++) {
                $urlSuffix = $page === 0 ? "index.html" : "index" . $page . ".html";
                $this->scrapePageAlbum($this->absoluteUrl($urlPath . $urlSuffix));
            }
        } finally {
            $this->rootDir = $originalRootDir;
        }
    }

    /**
     * Scrape every album linked from one listing page.
     *
     * @param string $url Absolute URL of the listing page.
     */
    public function scrapePageAlbum($url)
    {
        $pageContent = $this->getEncodeHtmlContent($url);
        $links = $pageContent->find(".i_list a")->getElements();

        foreach ($links as $link) {
            $href = $link->getAttribute("href");
            dump($href);
            if (trim((string) $href) === "") {
                // An anchor without a target would resolve to the site root; skip it.
                continue;
            }
            $this->scrapeSingleAlbum($this->absoluteUrl($href));
        }
    }

    /**
     * Download all images of one album into $rootDir . <album title>.
     *
     * Albums for which checkAlbumHasDownload() reports a record are skipped.
     *
     * @param string $url Absolute URL of the album's first page.
     */
    public function scrapeSingleAlbum($url)
    {
        Log::info("scrapeSingleAlbum $url");
        $pageContent = $this->getEncodeHtmlContent($url);
        $albumName = $pageContent->find(".item_title h1")->htmls()->get(0);
        $pageItems = $pageContent->find(".content:eq(0) .page a")->attrs("href")->all();

        if ($albumName === null || trim((string) $albumName) === "") {
            // Without a title the target directory would collapse to $rootDir itself.
            Log::error("album title not found, skipping " . $url);
            return;
        }

        if ($this->checkAlbumHasDownload($albumName)) {
            Log::info("已经下载过了,相册名:" . $albumName);
            return;
        }

        dump("当前相册名: " . $albumName);
        $albumDir = $this->rootDir . $albumName;
        $imageNo = 1;
        $description = null;
        $this->parseContent($albumDir, $pageContent, $imageNo, $description);

        // Drop the first two pager links and the last one; the first page was handled above.
        // NOTE(review): presumably these are the "previous", "page 1" and "next" links - confirm.
        $pageItems = array_slice($pageItems, 2, count($pageItems) - 3);
        foreach ($pageItems as $item) {
            $pageContent = $this->getEncodeHtmlContent($this->absoluteUrl($item));
            $this->parseContent($albumDir, $pageContent, $imageNo, $description);
        }
    }

    /**
     * Save every image of one album page into $dir.
     *
     * Files are named "<user>-<description>-<imageNo>-<original basename>". Files left
     * behind by older naming schemes are renamed instead of being downloaded again.
     *
     * @param string      $dir         Target directory; created if missing.
     * @param mixed       $pageContent Parsed page as returned by getEncodeHtmlContent().
     * @param int         $imageNo     Running image number within the album (by reference,
     *                                 incremented once per image element).
     * @param string|null $description Album description (by reference); read from the page's
     *                                 meta tags when still empty.
     */
    public function parseContent($dir, $pageContent, &$imageNo, &$description)
    {
        if (!is_dir($dir)) {
            try {
                // Recursive, and re-checked with is_dir() in case the directory appeared meanwhile.
                if (!mkdir($dir, 0777, true) && !is_dir($dir)) {
                    Log::error("cannot create directory " . $dir);
                    return;
                }
            } catch (Exception $e) {
                // Reached when the runtime converts the mkdir warning into an exception.
                Log::error($e->getMessage());
                return;
            }
        }

        $images = $pageContent->find(".content p img")->getElements();
        $user = $pageContent->find(".item_info div a:eq(-1) span")->htmls()->get(0);
        $userName = trim((string) $user);

        if ($description == null) {
            foreach ($pageContent->find("meta")->getElements() as $meta) {
                if (trim($meta->getAttribute("name")) == "description") {
                    $description = $meta->getAttribute("content");
                    break;
                }
            }
            dump("description: " . $description);
        }

        if ($imageNo == 1) {
            // Print the user name only once per album.
            dump("user is " . $user);
        }

        foreach ($images as $image) {
            // Short random pause (1-10 ms) between images.
            usleep(random_int(10, 100) * 100);

            $imageUrl = trim((string) $image->getAttribute("src"));
            if ($imageUrl === "") {
                Log::error("image element without src in " . $dir);
                // Still consume the sequence number so later files keep their numbering.
                $imageNo++;
                continue;
            }

            $trueImageUrl = $this->absoluteUrl($imageUrl);
            $fileInfo = pathinfo($trueImageUrl);

            if ($this->adoptExistingImage($dir, $userName, $description, $imageNo, $fileInfo)) {
                $imageNo++;
                continue;
            }

            $content = $this->downloadImage($trueImageUrl);
            if ($content !== "") {
                $target = $dir . DIRECTORY_SEPARATOR . $userName . "-" . $description . "-" . $imageNo . "-" . $fileInfo["basename"];
                dump("new file is " . $target);
                if (file_put_contents($target, $content) === false) {
                    Log::error("cannot write image file " . $target);
                }
            } else {
                Log::error("image content is empty " . $trueImageUrl);
            }
            $imageNo++;
        }
    }

    /**
     * Fetch a page and load it into the shared QueryList instance.
     *
     * The request is retried up to 100 times with a random 1-10 s pause when the
     * transfer fails. If every attempt fails, the instance is loaded with an empty string.
     *
     * @param string $url Absolute page URL.
     * @return mixed Result of $queryInstance->setHtml() for the converted page.
     */
    public function getEncodeHtmlContent($url)
    {
        // NOTE(review): "authority" and "referer" name www.xiurenji.net while $domainUrl
        // is www.xiurenb.net - confirm whether these captured headers are still wanted.
        $headers = [
            'authority: www.xiurenji.net',
            'pragma: no-cache',
            'cache-control: no-cache',
            'sec-ch-ua: " Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
            'sec-ch-ua-mobile: ?0',
            'upgrade-insecure-requests: 1',
            'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
            'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site: same-origin',
            'sec-fetch-mode: navigate',
            'sec-fetch-user: ?1',
            'sec-fetch-dest: document',
            'referer: https://www.xiurenji.net/XiuRen/',
            'accept-language: zh-CN,zh;q=0.9',
            'cookie: UM_distinctid=177fd93a0ca93c-06b94658d5d337-121a4759-13c680-177fd93a0cbcaf; ASPSESSIONIDCATDQACD=FDPMPCLAMHNCPJFCBLKFLCKH; CNZZDATA1278618868=367774893-1614867004-%7C1625926983; __51cke__=; __tins__20641871=%7B%22sid%22%3A%201625931982756%2C%20%22vd%22%3A%203%2C%20%22expires%22%3A%201625933829110%7D; __51laig__=7',
        ];

        $attempts = 0;
        $html = "";
        do {
            try {
                // TLS verification is disabled for page requests, as in the original code.
                // NOTE(review): this accepts any certificate - confirm it is still required.
                [$response] = $this->curlGet($url, $headers, false);
                $html = iconv_gbk_to_uft8($response);
                $this->queryNew->setHtml($response);
            } catch (Exception $e) {
                echo $e->getMessage() . "\n";
                echo $e->getTraceAsString() . "\n";
                $this->pauseBeforeRetry("");
                $attempts++;
                continue;
            }
            break;
        } while ($attempts < 100);

        return $this->queryInstance->setHtml($html);
    }

    /**
     * Report whether an ImageRecord with this album name exists.
     *
     * @param string $albumName Album title.
     * @return bool
     */
    private function checkAlbumHasDownload($albumName)
    {
        return ImageRecord::where("name", $albumName)->first() !== null;
    }

    /**
     * Resolve a link against $domainUrl using exactly one slash at the join.
     *
     * @param string $path Absolute URL, protocol-relative URL, or site-relative path.
     * @return string Absolute URL.
     */
    private function absoluteUrl($path)
    {
        $path = (string) $path;
        if (preg_match('#^https?://#i', $path) === 1) {
            return $path;
        }
        if (strpos($path, "//") === 0) {
            return "https:" . $path;
        }
        return rtrim($this->domainUrl, "/") . "/" . ltrim($path, "/");
    }

    /**
     * Detect a file already on disk for this image and rename it to the current scheme.
     *
     * Candidates are probed in a fixed order, oldest naming scheme first; the first one
     * that exists wins. A null target means the file is kept under its current name.
     *
     * @param string      $dir         Album directory.
     * @param string      $user        Trimmed user name (may be empty).
     * @param string|null $description Album description.
     * @param int         $imageNo     Sequence number of the image.
     * @param array       $fileInfo    pathinfo() result for the image URL.
     * @return bool True when an existing file was found, so no download is needed.
     */
    private function adoptExistingImage($dir, $user, $description, $imageNo, array $fileInfo)
    {
        $prefix = $dir . DIRECTORY_SEPARATOR;
        $stem = $fileInfo["filename"];
        $base = $fileInfo["basename"];

        $numbered = $imageNo . "-" . $stem;
        $withUser = $user . "-" . $numbered;
        $final = $user . "-" . $description . "-" . $imageNo . "-";
        $noUser = "-" . $description . "-" . $imageNo . "-";
        // Loose comparison on purpose: an empty description counts as missing.
        $hasDescription = $description != null;

        $steps = [
            // Case 1: bare file name, no sequence number yet.
            [$stem . ".jpg", $numbered . ".jpg"],
            [$stem . ".webp", $numbered . ".webp"],
            // Case 2: user missing.
            [$numbered . ".jpg", $withUser . ".jpg"],
            [$numbered . ".webp", $withUser . ".webp"],
            // Case 3: description missing.
            [$withUser . ".jpg", $hasDescription ? $final . $stem . ".jpg" : null],
            [$withUser . ".webp", $hasDescription ? $final . $stem . ".webp" : null],
            // Case 4: already stored under the current scheme.
            [$final . $stem . ".jpg", null],
            [$final . $base, null],
            // Case 5: stored earlier with an empty user.
            [$noUser . $stem . ".jpg", $final . $stem . ".jpg"],
            [$noUser . $base, $final . $base],
        ];

        foreach ($steps as [$from, $to]) {
            if (!file_exists($prefix . $from)) {
                continue;
            }
            if ($to !== null && $to !== $from && !rename($prefix . $from, $prefix . $to)) {
                Log::error("cannot rename " . $prefix . $from . " to " . $prefix . $to);
            }
            return true;
        }

        return false;
    }

    /**
     * Download one image, retrying up to 100 times with a random 1-10 s pause.
     *
     * @param string $imageUrl Absolute image URL.
     * @return string Image bytes, or "" when the image could not be fetched.
     */
    private function downloadImage($imageUrl)
    {
        $headers = [
            'Connection: keep-alive',
            'sec-ch-ua: " Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
            'sec-ch-ua-mobile: ?0',
            'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.109 Safari/537.36',
            'sec-ch-ua-platform: "macOS"',
            'Accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
            'Sec-Fetch-Site: same-origin',
            'Sec-Fetch-Mode: no-cors',
            'Sec-Fetch-Dest: image',
            'Accept-Language: zh-CN,zh;q=0.9',
            'Cookie: UM_distinctid=17e8fc4c12917-0742b5d542c2af-133a6253-13c680-17e8fc4c12a924; ASPSESSIONIDCWCCSCAC=EODDLCJCADBNDFDGCMALGMKO; CNZZDATA1278618868=1237248404-1643081659-%7C1646134190; ASPSESSIONIDCWCDTDAD=HAJGDPOCNBIKMMMNLCENPLAM',
        ];

        for ($attempt = 0; $attempt < 100; $attempt++) {
            try {
                [$body, $status] = $this->curlGet($imageUrl, $headers, true);
                if ($status === 404) {
                    // A missing image will not appear on retry; give up immediately.
                    Log::error("image not found (HTTP 404) " . $imageUrl);
                    return "";
                }
                if ($status >= 400) {
                    // Never store an HTTP error page as if it were image data.
                    throw new Exception("get image has error: HTTP status " . $status);
                }
                return $body;
            } catch (Exception $e) {
                echo $e->getMessage() . "\n";
                echo $e->getTraceAsString() . "\n";
                $this->pauseBeforeRetry("wait for $imageUrl ");
            }
        }

        return "";
    }

    /**
     * Perform a single GET request with cURL.
     *
     * @param string $url       Absolute URL.
     * @param array  $headers   Raw request header lines.
     * @param bool   $verifyTls Whether to keep cURL's default certificate verification.
     * @return array Two elements: response body (string) and HTTP status code (int).
     * @throws Exception When the transfer itself fails.
     */
    private function curlGet($url, array $headers, $verifyTls)
    {
        $options = [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            // Empty string: advertise and decode every content coding cURL supports.
            CURLOPT_ENCODING => '',
            CURLOPT_MAXREDIRS => 10,
            // Finite limits so a stalled connection fails and is retried instead of hanging.
            CURLOPT_CONNECTTIMEOUT => 30,
            CURLOPT_TIMEOUT => 300,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
            CURLOPT_CUSTOMREQUEST => 'GET',
            CURLOPT_HTTPHEADER => $headers,
        ];
        if (!$verifyTls) {
            $options[CURLOPT_SSL_VERIFYPEER] = false;
            $options[CURLOPT_SSL_VERIFYHOST] = false;
        }

        $handle = curl_init();
        try {
            curl_setopt_array($handle, $options);
            $body = curl_exec($handle);
            if ($body === false) {
                throw new Exception("curl request failed for " . $url . ": " . curl_error($handle));
            }
            return [$body, (int) curl_getinfo($handle, CURLINFO_HTTP_CODE)];
        } finally {
            curl_close($handle);
        }
    }

    /**
     * Sleep for a random 1-10 seconds before the next retry and report the pause.
     *
     * @param string $label Text printed in front of the message (may be empty).
     */
    private function pauseBeforeRetry($label)
    {
        // usleep() takes microseconds: 10000 * [100, 1000] is 1 to 10 seconds.
        $sleepTime = 10000 * random_int(100, 1000);
        echo "{$label}sleep {$sleepTime} microseconds \n";
        usleep($sleepTime);
    }
}