queryInstance = QueryList::getInstance();
    }

    /**
     * Scrapes the album index pages and every album linked from them.
     *
     * Fetches the first index page, reads the first ".page span" fragment and,
     * when it casts to a positive integer, walks index.html, index1.html, ...
     * handing each page URL to scrapePageAlbum().
     */
    public function scrapeAlbum()
    {
        echo "111";
        $pageSize = 20;
        $pageCount = $this->getEncodeHtmlContent("https://www.xiurenji.vip/XiuRen/index.html")
            ->find(".page span")
            ->htmls()
            ->get(0);
        print_r($pageCount);

        if ((int)$pageCount <= 0) {
            return;
        }

        // NOTE(review): the value read from the page is only used as a sanity
        // check; the number of albums is then forced to 40, so exactly
        // ceil(40 / 20) + 1 = 3 index pages are visited. Confirm whether this
        // override is intentional or left over from debugging.
        $pageCount = 40;
        $lastIndex = ceil($pageCount / $pageSize);
        for ($i = 0; $i <= $lastIndex; $i++) {
            $urlSuffix = $i == 0 ? "index.html" : "index" . $i . ".html";
            $this->scrapePageAlbum($this->xiurenRootUrl . $urlSuffix);
        }
    }

    /**
     * Scrapes every album linked from one index page.
     *
     * @param string $url Absolute URL of the index page.
     */
    public function scrapePageAlbum($url)
    {
        $pageContent = $this->getEncodeHtmlContent($url);
        $items = $pageContent->find(".dan a")->getElements();
        foreach ($items as $item) {
            $href = $item->getAttribute("href");
            dump($href);
            $this->scrapeSingleAlbum($this->domainUrl . $href);
        }
    }

    /**
     * Downloads all images of one album, page by page.
     *
     * The album name is cut out of the ".ina p:nth-child(2)" text; when that
     * result looks implausible (too short, too long, or without "No") the
     * name is taken from ".ina p b:nth-child(2)" instead. Images are stored
     * under rootDir . albumName.
     *
     * @param string $url Absolute URL of the album's first page.
     */
    public function scrapeSingleAlbum($url)
    {
        Log::info("scrapeSingleAlbum $url");
        $pageContent = $this->getEncodeHtmlContent($url);
        $items = $pageContent->find(".ina p:nth-child(2)")->texts();
        $pageItems = $pageContent->find(".page a:eq(-2)")->htmls();
        if (count($pageItems) <= 0) {
            dump("this album is error: ". $url);
            Log::error("this album is error: " . $url);
            return;
        }

        // These values depend only on the URL and the pager, not on the item,
        // so they are computed once instead of on every loop iteration.
        $pageCount = (int)$pageItems[0];
        $slashPos = strpos($url, "XiuRen/") + 7;
        $dotPos = strrpos($url, ".");
        $albumCode = substr($url, $slashPos, $dotPos - $slashPos);

        foreach ($items as $item) {
            // Byte offsets (6 / -5) are kept exactly as they were; they are
            // tuned to the site's text layout.
            $albumName = ltrim(substr($item, 6, strrpos($item, "]") - 5));
            if (mb_strlen($albumName) <= 12 || mb_strlen($albumName) >= 50 || !str_contains($albumName, "No")) {
                dump("old Album: " . $albumName);
                $fallbackNames = $pageContent->find(".ina p b:nth-child(2)")->texts();
                if (!isset($fallbackNames[0])) {
                    // Previously this read index 0 unconditionally and failed
                    // with an undefined-offset error when nothing matched.
                    Log::error("album name not found: " . $url);
                    continue;
                }
                $albumName = urldecode($fallbackNames[0]);
                dump("new Album: " . $albumName);
            }

            $imageNo = 1;
            $albumDir = $this->rootDir. $albumName;
            $this->parseContent($albumDir, $pageContent, $imageNo);
            dump("albumName: ". $albumName);

            // Follow-up pages are <albumCode>_1.html .. <albumCode>_<pageCount-1>.html.
            // NOTE(review): a separate variable is used so the first page is
            // not overwritten for later items; if setHtml() returns the shared
            // query instance the two variables alias the same object anyway -
            // confirm against the QueryList version in use.
            for ($i = 1; $i < $pageCount; $i++) {
                $subPage = $this->getEncodeHtmlContent($this->xiurenRootUrl . $albumCode . "_" . $i . ".html");
                $this->parseContent($albumDir, $subPage, $imageNo);
            }
        }
    }

    /**
     * Downloads every ".img p img" image of one album page into $dir.
     *
     * Files are stored as "<imageNo>-<basename>". A file that already exists
     * under its plain basename is renamed to the numbered form, and a file
     * that already exists in numbered form is skipped; in both cases the
     * counter still advances.
     *
     * @param string $dir         Target directory; created when missing.
     * @param mixed  $pageContent Query object returned by getEncodeHtmlContent().
     * @param int    $imageNo     Running image counter, updated in place.
     */
    public function parseContent($dir, $pageContent, &$imageNo)
    {
        if (!is_dir($dir)) {
            try {
                $created = mkdir($dir);
            } catch (Exception $e) {
                Log::error($e->getMessage());
                return;
            }
            // mkdir() reports failure through its return value; without this
            // check a failed creation went on to download into a missing
            // directory whenever no exception was raised.
            if (!$created && !is_dir($dir)) {
                Log::error("can not create directory " . $dir);
                return;
            }
        }

        $images = $pageContent->find(".img p img")->getElements();
        foreach ($images as $image) {
            // Short random pause (1-10 ms) between images.
            usleep(random_int(10, 100) * 100);

            $imageUrl = $image->getAttribute("src");
            $trueImageUrl = "https://www.xiurenji.vip" . $imageUrl;
            $fileInfo = pathinfo($trueImageUrl);
            $plainPath = $dir . "/" . $fileInfo["basename"];
            $numberedPath = $dir . "/" . $imageNo . "-" . $fileInfo["basename"];

            if (file_exists($plainPath)) {
                rename($plainPath, $numberedPath);
                $imageNo++;
                continue;
            }
            if (file_exists($numberedPath)) {
                $imageNo++;
                continue;
            }

            dump($fileInfo);
            $attempts = 0;
            $content = "";
            do {
                try {
                    $curl_handle = curl_init();
                    curl_setopt($curl_handle, CURLOPT_URL, $trueImageUrl);
                    // NOTE(review): CURLOPT_CONNECTTIMEOUT is expressed in
                    // seconds; 2000 may have been meant as milliseconds - confirm.
                    curl_setopt($curl_handle, CURLOPT_CONNECTTIMEOUT, 2000);
                    curl_setopt($curl_handle, CURLOPT_RETURNTRANSFER, 1);
                    curl_setopt($curl_handle, CURLOPT_USERAGENT, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36');
                    curl_setopt($curl_handle, CURLOPT_REFERER, $this->xiurenRootUrl);
                    curl_setopt($curl_handle, CURLOPT_FOLLOWLOCATION, true);
                    $content = curl_exec($curl_handle);
                    if ($content === false) {
                        // Build the exception before closing so curl_error()
                        // still sees the handle.
                        $le = new Exception("get image has error: " . curl_error($curl_handle));
                        curl_close($curl_handle);
                        throw $le;
                    }
                    curl_close($curl_handle);
                } catch (ErrorException | Exception $e) {
                    echo $e->getTraceAsString() . "\n";
                    // usleep() takes microseconds, so this waits 1-10 seconds
                    // (the message text says "nano second").
                    $sleepTime = 10000 * random_int(100, 1000);
                    echo "wait for $trueImageUrl sleep {$sleepTime} nano second \n";
                    usleep($sleepTime);
                    $attempts ++;
                    continue;
                }
                break;
            } while ($attempts < 100);

            if ($content != "") {
                file_put_contents($numberedPath, $content);
            } else {
                Log::error("image content is empty ". $trueImageUrl);
            }
            $imageNo++;
        }
    }

    /**
     * Fetches a page, converts it from GB2312 to UTF-8 and loads it into the
     * shared query instance.
     *
     * A failed transfer is retried up to 100 times with a random 1-10 second
     * pause in between. When every attempt fails, an empty document is loaded.
     *
     * @param string $url Absolute URL of the page to fetch.
     * @return mixed Whatever queryInstance->setHtml() returns.
     */
    public function getEncodeHtmlContent($url)
    {
        $attempts = 0;
        $html = "";
        do {
            try {
                $curl = curl_init();
                curl_setopt_array($curl, [
                    CURLOPT_URL => $url,
                    CURLOPT_RETURNTRANSFER => true,
                    CURLOPT_ENCODING => '',
                    CURLOPT_MAXREDIRS => 10,
                    CURLOPT_TIMEOUT => 0,
                    CURLOPT_FOLLOWLOCATION => true,
                    CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
                    CURLOPT_CUSTOMREQUEST => 'GET',
                    // NOTE(review): TLS peer and host verification are
                    // switched off for these requests.
                    CURLOPT_SSL_VERIFYPEER => false,
                    CURLOPT_SSL_VERIFYHOST => false,
                    CURLOPT_HTTPHEADER => [
                        'authority: www.xiurenji.vip',
                        'pragma: no-cache',
                        'cache-control: no-cache',
                        'sec-ch-ua: " Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
                        'sec-ch-ua-mobile: ?0',
                        'upgrade-insecure-requests: 1',
                        'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
                        'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                        'sec-fetch-site: same-origin',
                        'sec-fetch-mode: navigate',
                        'sec-fetch-user: ?1',
                        'sec-fetch-dest: document',
                        'referer: https://www.xiurenji.vip/XiuRen/',
                        'accept-language: zh-CN,zh;q=0.9',
                        'cookie: UM_distinctid=177fd93a0ca93c-06b94658d5d337-121a4759-13c680-177fd93a0cbcaf; ASPSESSIONIDCATDQACD=FDPMPCLAMHNCPJFCBLKFLCKH; CNZZDATA1278618868=367774893-1614867004-%7C1625926983; __51cke__=; __tins__20641871=%7B%22sid%22%3A%201625931982756%2C%20%22vd%22%3A%203%2C%20%22expires%22%3A%201625933829110%7D; __51laig__=7',
                    ],
                ]);
                $response = curl_exec($curl);
                $error = curl_error($curl);
                echo $error;
                curl_close($curl);

                // curl_exec() signals failure by returning false, not by
                // throwing; raise here so the retry path below actually runs.
                if ($response === false) {
                    throw new Exception("get html has error: " . $error);
                }

                $html = iconv('gb2312', 'UTF-8//IGNORE', $response);
            } catch (Exception $e) {
                echo $e->getMessage(). "\n";
                echo $e->getTraceAsString() . "\n";
                // usleep() takes microseconds, so this waits 1-10 seconds
                // (the message text says "nano second").
                $sleepTime = 10000 * random_int(100, 1000);
                echo "sleep {$sleepTime} nano second \n";
                usleep($sleepTime);
                $attempts ++;
                continue;
            }
            break;
        } while ($attempts < 100);

        return $this->queryInstance->setHtml($html);
    }
}