queryInstance = QueryList::getInstance();
        $this->queryNew = new QueryList();
    }

    /**
     * Walk the album listing pages and scrape every album linked from them.
     *
     * Reads the number shown in ".page span strong" on the first listing page and
     * visits listing pages "index.html", "index1.html", ... up to and including
     * index ceil(number / 20).
     *
     * NOTE(review): the number is read from a hard-coded xiurenb.net URL while the
     * listing pages themselves are built from $this->xiurenRootUrl - confirm both
     * point at the same listing.
     * NOTE(review): confirm that the site really numbers its pages "index1.html",
     * "index2.html", ... and that the inclusive upper bound is intended.
     *
     * @return void
     */
    public function scrapeAlbum()
    {
        $pageSize = 20;
        $pageCount = $this->getEncodeHtmlContent("https://www.xiurenb.net/YouWu/index.html")
            ->find(".page span strong")
            ->htmls()
            ->get(0);
        print_r($pageCount);

        // Use the same integer for the guard and for the arithmetic so a value with
        // surrounding text/whitespace cannot pass the guard and then break the division.
        $total = (int)$pageCount;
        if ($total <= 0) {
            return;
        }

        // Loop-invariant, so computed once.
        $lastPageIndex = ceil($total / $pageSize);
        for ($i = 0; $i <= $lastPageIndex; $i++) {
            $urlSuffix = $i == 0 ? "index.html" : "index" . $i . ".html";
            $this->scrapePageAlbum($this->xiurenRootUrl . $urlSuffix);
        }
    }

    /**
     * Scrape every album linked from one listing page.
     *
     * Each ".i_list a" element's href is appended to $this->domainUrl and handed to
     * scrapeSingleAlbum().
     *
     * @param string $url URL of the listing page.
     * @return void
     */
    public function scrapePageAlbum($url)
    {
        $pageContent = $this->getEncodeHtmlContent($url);
        $items = $pageContent->find(".i_list a")->getElements();

        foreach ($items as $item) {
            $href = $item->getAttribute("href");
            dump($href);
            $this->scrapeSingleAlbum($this->domainUrl . $href);
        }
    }

    /**
     * Download all images of a single album, following its pagination links.
     *
     * The album title (".item_title h1") is used as the directory name below
     * $this->rootDir. Albums for which checkAlbumHasDownload() returns true are skipped.
     *
     * @param string $url URL of the album's first page.
     * @return void
     */
    public function scrapeSingleAlbum($url)
    {
        Log::info("scrapeSingleAlbum $url");
        $pageContent = $this->getEncodeHtmlContent($url);
        $albumName = $pageContent->find(".item_title h1")->htmls()->get(0);

        // Without a title the images would be written straight into rootDir.
        if ($albumName === null || trim($albumName) === "") {
            Log::error("album title not found, skipping " . $url);
            return;
        }

        if ($this->checkAlbumHasDownload($albumName)) {
            Log::info("已经下载过了,相册名:" . $albumName);
            return;
        }

        $albumDir = $this->rootDir . $albumName;

        // The counter is shared across pages so numbering continues from page to page.
        $imageNo = 1;
        $this->parseContent($albumDir, $pageContent, $imageNo);

        $pageItems = $pageContent->find(".content:eq(0) .page a")->attrs("href")->all();
        // Drops the first two links and the last one.
        // NOTE(review): presumably "previous"/current-page and "next" links - confirm
        // against the site's pager markup.
        $pageItems = array_slice($pageItems, 2, count($pageItems) - 3);

        foreach ($pageItems as $item) {
            $pageContent = $this->getEncodeHtmlContent($this->domainUrl . $item);
            $this->parseContent($albumDir, $pageContent, $imageNo);
        }
    }

    /**
     * Save every ".content p img" image of one album page into $dir.
     *
     * Files are stored as "<user>-<imageNo>-<basename>", where <user> is the trimmed
     * text of ".item_info div a:eq(-1) span". Files that already exist under one of
     * the two older naming schemes ("<basename>" or "<imageNo>-<basename>") are
     * renamed instead of downloaded again; files already present under the final
     * name are skipped. Each download is retried up to 100 times with a random
     * pause between attempts.
     *
     * @param string $dir         Target directory; created (recursively) when missing.
     * @param mixed  $pageContent Parsed page as returned by getEncodeHtmlContent().
     * @param int    $imageNo     Running image counter, passed by reference and
     *                            incremented once for every image on the page.
     * @return void
     */
    public function parseContent($dir, $pageContent, &$imageNo)
    {
        if (!is_dir($dir)) {
            try {
                // mkdir() reports failure by returning false (plus a warning), so the
                // return value is checked as well as any exception raised by an error
                // handler. The second is_dir() tolerates a concurrent creation.
                if (!mkdir($dir, 0777, true) && !is_dir($dir)) {
                    Log::error("failed to create directory " . $dir);
                    return;
                }
            } catch (Exception $e) {
                Log::error($e->getMessage());
                return;
            }
        }

        $images = $pageContent->find(".content p img")->getElements();
        $user = $pageContent->find(".item_info div a:eq(-1) span")->htmls()->get(0);
        dump("user is " . $user);
        // get(0) yields null when the selector matches nothing; cast before trimming.
        $userName = trim((string)$user);

        foreach ($images as $image) {
            // Short random pause between images.
            usleep(random_int(10, 100) * 100);

            $imageUrl = $image->getAttribute("src");
            $trueImageUrl = "https://www.xiurenji.net" . $imageUrl;
            $fileInfo = pathinfo($trueImageUrl);
            $baseName = $fileInfo["basename"];

            $plainPath = $dir . DIRECTORY_SEPARATOR . $baseName;
            $numberedPath = $dir . DIRECTORY_SEPARATOR . $imageNo . "-" . $baseName;
            $finalPath = $dir . DIRECTORY_SEPARATOR . $userName . "-" . $imageNo . "-" . $baseName;

            if (file_exists($plainPath)) {
                rename($plainPath, $numberedPath);
                $imageNo++;
                continue;
            }

            if (file_exists($numberedPath)) {
                rename($numberedPath, $finalPath);
                $imageNo++;
                continue;
            }

            if (file_exists($finalPath)) {
                $imageNo++;
                continue;
            }

            dump($fileInfo);

            $content = "";
            for ($attempts = 0; $attempts < 100; $attempts++) {
                $curlHandle = curl_init();
                try {
                    curl_setopt_array($curlHandle, [
                        CURLOPT_URL => $trueImageUrl,
                        // CURLOPT_CONNECTTIMEOUT is expressed in seconds; the previous
                        // value of 20000 meant waiting hours for a connection.
                        CURLOPT_CONNECTTIMEOUT => 20,
                        CURLOPT_RETURNTRANSFER => 1,
                        CURLOPT_USERAGENT => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
                        CURLOPT_REFERER => $this->xiurenRootUrl,
                        CURLOPT_FOLLOWLOCATION => true,
                        CURLOPT_ENCODING => '',
                        CURLOPT_MAXREDIRS => 10,
                        CURLOPT_TIMEOUT => 0,
                        CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
                        CURLOPT_CUSTOMREQUEST => 'GET',
                        CURLOPT_HTTPHEADER => [
                            'authority: www.xiurenji.net',
                            'pragma: no-cache',
                            'cache-control: no-cache',
                            'sec-ch-ua: "Google Chrome";v="95", "Chromium";v="95", ";Not A Brand";v="99"',
                            'sec-ch-ua-mobile: ?0',
                            'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
                            'sec-ch-ua-platform: "macOS"',
                            'accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
                            'sec-fetch-site: same-origin',
                            'sec-fetch-mode: no-cors',
                            'sec-fetch-dest: image',
                            'referer: https://www.xiurenji.net/XiuRen/9483.html',
                            'accept-language: zh-CN,zh;q=0.9',
                            'cookie: UM_distinctid=17cfa8bea8eb9e-0dd0c6d032d0fc-1c306851-13c680-17cfa8bea8fc85; CNZZDATA1278618868=1505121253-1636283360-%7C1636283360; __51cke__=; ASPSESSIONIDQAQAATSQ=LBLGNPMDHKKMNOPDBCEAPIMH; __tins__20641871=%7B%22sid%22%3A%201636291046220%2C%20%22vd%22%3A%202%2C%20%22expires%22%3A%201636292852634%7D; __51laig__=2',
                        ],
                    ]);

                    $content = curl_exec($curlHandle);
                    if ($content === false) {
                        throw new Exception("get image has error: " . curl_error($curlHandle));
                    }
                    break;
                } catch (ErrorException|Exception $e) {
                    echo $e->getTraceAsString() . "\n";
                    // usleep() takes microseconds, so this pauses for 1-10 seconds
                    // (the printed unit is kept as it was).
                    $sleepTime = 10000 * random_int(100, 1000);
                    echo "wait for $trueImageUrl sleep {$sleepTime} nano second \n";
                    usleep($sleepTime);
                } finally {
                    // Runs on success, failure and break alike, so the handle never leaks.
                    curl_close($curlHandle);
                }
            }

            if (is_string($content) && $content !== "") {
                file_put_contents($finalPath, $content);
            } else {
                Log::error("image content is empty " . $trueImageUrl);
            }
            $imageNo++;
        }
    }

    /**
     * Fetch a page and load it into the shared QueryList instance.
     *
     * The response body is passed through iconv_gbk_to_uft8() (defined outside this
     * section; presumably a GBK to UTF-8 conversion - confirm) before being given to
     * $this->queryInstance. The raw, unconverted body is additionally set on
     * $this->queryNew. A failed transfer is retried up to 100 times with a random
     * pause; when every attempt fails an empty document is loaded.
     *
     * NOTE(review): TLS peer and host verification are disabled for these requests.
     * NOTE(review): CURLOPT_ENCODING controls the Accept-Encoding header; 'UTF-8' is
     * a charset, not a content coding - the image download uses '' instead.
     *
     * @param string $url Page URL to fetch.
     * @return mixed Whatever $this->queryInstance->setHtml() returns.
     */
    public function getEncodeHtmlContent($url)
    {
        $html = "";

        for ($attempts = 0; $attempts < 100; $attempts++) {
            $curl = curl_init();
            try {
                curl_setopt_array($curl, [
                    CURLOPT_URL => $url,
                    CURLOPT_RETURNTRANSFER => true,
                    CURLOPT_ENCODING => 'UTF-8',
                    CURLOPT_MAXREDIRS => 10,
                    CURLOPT_TIMEOUT => 0,
                    CURLOPT_FOLLOWLOCATION => true,
                    CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
                    CURLOPT_CUSTOMREQUEST => 'GET',
                    CURLOPT_SSL_VERIFYPEER => false,
                    CURLOPT_SSL_VERIFYHOST => false,
                    CURLOPT_HTTPHEADER => [
                        'authority: www.xiurenji.net',
                        'pragma: no-cache',
                        'cache-control: no-cache',
                        'sec-ch-ua: " Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
                        'sec-ch-ua-mobile: ?0',
                        'upgrade-insecure-requests: 1',
                        'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
                        'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                        'sec-fetch-site: same-origin',
                        'sec-fetch-mode: navigate',
                        'sec-fetch-user: ?1',
                        'sec-fetch-dest: document',
                        'referer: https://www.xiurenji.net/XiuRen/',
                        'accept-language: zh-CN,zh;q=0.9',
                        'cookie: UM_distinctid=177fd93a0ca93c-06b94658d5d337-121a4759-13c680-177fd93a0cbcaf; ASPSESSIONIDCATDQACD=FDPMPCLAMHNCPJFCBLKFLCKH; CNZZDATA1278618868=367774893-1614867004-%7C1625926983; __51cke__=; __tins__20641871=%7B%22sid%22%3A%201625931982756%2C%20%22vd%22%3A%203%2C%20%22expires%22%3A%201625933829110%7D; __51laig__=7',
                    ],
                ]);

                $response = curl_exec($curl);
                // curl_exec() signals failure by returning false, not by throwing;
                // raise here so the retry branch below is actually reached.
                if ($response === false) {
                    throw new Exception("get html has error: " . curl_error($curl));
                }

                $html = iconv_gbk_to_uft8($response);
                $this->queryNew->setHtml($response);
                break;
            } catch (Exception $e) {
                echo $e->getMessage() . "\n";
                echo $e->getTraceAsString() . "\n";
                // usleep() takes microseconds, so this pauses for 1-10 seconds
                // (the printed unit is kept as it was).
                $sleepTime = 10000 * random_int(100, 1000);
                echo "sleep {$sleepTime} nano second \n";
                usleep($sleepTime);
            } finally {
                curl_close($curl);
            }
        }

        // The loop only reaches its limit when no attempt succeeded.
        if ($attempts >= 100) {
            Log::error("giving up fetching " . $url);
        }

        dump("current url: " . $url);
        return $this->queryInstance->setHtml($html);
    }

    /**
     * Tell whether an ImageRecord with the given name already exists.
     *
     * @param string $albumName Album title to look up in the "name" column.
     * @return bool True when a matching record is found, false otherwise.
     */
    private function checkAlbumHasDownload($albumName)
    {
        $record = ImageRecord::where("name", $albumName)->first();

        return $record !== null;
    }
}