get("https://www.v2ph.com/album/ax86795a.html?page=2");
        $client = new Client(['base_uri' => 'https://www.v2ph.com']);
        $cookieStr = "__cfduid=d959b19dbaa25a5fac7c5fe1d3988d15e1610778634; frontend=36ab879e6d75ae4e19e4843a1bdf2e45; _gid=GA1.2.22159331.1610804846; __cf_bm=19f3172d7b8d3c97ce28f86ec56af4da3c3560a2-1610805781-1800-Adr3Ph9l2oeHE9Ms+YrTrKfE1uwMO7Tpcw/WxfRrPldspHy/AcwPovSpDcrz+DE3UHWhrw60voIaOsPy3VvpvTV9fjU9F1Hk3y1U5O6V1RZeclh6+YoIpZc1UfRyixPrKw==; frontend-rmu=qUl6oIQSib76RH6A4ZJGALhooK63; frontend-rmt=3d1i%2F0DU5cuBB%2BII5MzosWl0MPonPRchZyJyhL73PzaHWBzoq9RFDWJG%2FDPtCncI; _ga_170M3FX3HZ=GS1.1.1610804845.1.1.1610806191.0; _ga=GA1.2.6396742.1610804846; _gat_UA-140713725-1=1";
        $cookieArr = parseCookieHeader($cookieStr);
        $cookieJar = CookieJar::fromArray($cookieArr, ".v2ph.com");
        // dump($cookieJar->getCookieByName("__cfduid")->getDomain());exit;
        $response = $client->request('GET', '/album/am4x838z.html?page=6', [
            'cookies' => $cookieJar,
            'headers' => [
                'user-agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
                'accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                'authority' => 'www.v2ph.com',
                'referer' => 'https://www.v2ph.com/company/beautyleg',
                // 'cookie' => '__cfduid=d959b19dbaa25a5fac7c5fe1d3988d15e1610778634; frontend=36ab879e6d75ae4e19e4843a1bdf2e45; _gid=GA1.2.22159331.1610804846; __cf_bm=19f3172d7b8d3c97ce28f86ec56af4da3c3560a2-1610805781-1800-Adr3Ph9l2oeHE9Ms+YrTrKfE1uwMO7Tpcw/WxfRrPldspHy/AcwPovSpDcrz+DE3UHWhrw60voIaOsPy3VvpvTV9fjU9F1Hk3y1U5O6V1RZeclh6+YoIpZc1UfRyixPrKw==; frontend-rmu=qUl6oIQSib76RH6A4ZJGALhooK63; frontend-rmt=3d1i%2F0DU5cuBB%2BII5MzosWl0MPonPRchZyJyhL73PzaHWBzoq9RFDWJG%2FDPtCncI; _ga_170M3FX3HZ=GS1.1.1610804845.1.1.1610806191.0; _ga=GA1.2.6396742.1610804846; _gat_UA-140713725-1=1'
            ],
            // 'timeout' => 3.14,
            // 'proxy' => 'tcp://localhost:8125',
            // 'cert' => ['/path/server.pem', 'password'],
        ]);
        $body = $response->getBody();
        $html = (string)$body;
        echo $html;
        $attrs = $this->queryInstance->html($html)->find("img.img-fluid.album-photo.d-block.mx-auto")->attrs("data-src");
        // foreach ($attrs as $attr) {
        //     CommonService::downloadImage("/Users/shixuesen/Pictures/bl", $attr);
        // }
        // = $content->find("img.img-fluid.album-photo.d-block.mx-auto")->attrs("data-src");
        dump($attrs);
    }

    /** Scheme and host that relative album links are appended to. */
    public $domainUrl = "https://www.v2ph.com";

    /** Listing page whose albums are crawled by scrapeAlbum(). */
    public $albumRootUrl = "https://www.v2ph.com/company/beautyleg";

    // public $rootDir = "/Users/shixuesen/Documents/tmp/xiuren/";
    /** Directory under which one sub-directory per album is created. */
    public $rootDir = "/Volumes/Backup/images/beautyleg/";

    /** Shared QueryList instance; every html() call re-points it at new markup. */
    public $queryInstance;

    public function __construct()
    {
        $this->queryInstance = QueryList::getInstance();
    }

    /**
     * Crawl every listing page under $albumRootUrl.
     *
     * Reads the number shown in the listing header span, divides it by the
     * page size (16) and visits each resulting listing page in turn. The
     * first page is the bare listing URL; later pages use "?page=N".
     *
     * @return void
     */
    public function scrapeAlbum()
    {
        $pageSize = 16;

        $html = $this->curlContent($this->albumRootUrl);
        if (!is_string($html) || $html === "") {
            Log::error("unable to fetch album listing: " . $this->albumRootUrl);
            return;
        }

        // presumably the total number of albums — TODO confirm against the page markup
        $rawCount = $this->queryInstance->html($html)
            ->find("body > div.container.main-wrap > div.pt-2 > div.py-2.text-center > span")
            ->htmls()
            ->get(0);
        $total = (int)$rawCount;
        if ($total <= 0) {
            return;
        }

        $lastPage = (int)ceil($total / $pageSize);
        for ($page = 1; $page <= $lastPage; $page++) {
            $urlSuffix = $page === 1 ? "" : "?page=" . $page;
            $this->scrapePageAlbum($this->albumRootUrl . $urlSuffix);
        }
    }

    /**
     * Crawl one listing page and scrape every album it links to.
     *
     * @param string $url Absolute URL of the listing page.
     * @return void
     */
    public function scrapePageAlbum($url)
    {
        echo "scrapePageAlbum : ". $url . "\n";

        $html = $this->curlContent($url);
        if (!is_string($html) || $html === "") {
            Log::error("unable to fetch listing page: " . $url);
            return;
        }

        // Copy the links into a plain array before descending: scrapeSingleAlbum()
        // calls html() on the same shared QueryList instance.
        $hrefs = [];
        foreach ($this->queryInstance->html($html)->find(".media-cover")->attrs("href") as $href) {
            if (is_string($href) && $href !== "") {
                $hrefs[] = $href;
            }
        }

        foreach ($hrefs as $href) {
            $this->scrapeSingleAlbum($this->domainUrl . $href);
        }
    }

    /**
     * Download every image of a single album.
     *
     * The page count is taken from the number in the last pager link
     * (e.g. "/album/z3x469oa.html?page=5"). The first page is the album URL
     * itself; pages 2..N are fetched with "?page=N" appended.
     *
     * @param string $url Absolute album URL without a query string.
     * @return void
     */
    public function scrapeSingleAlbum($url)
    {
        echo "scrapeSingleAlbum : ". $url . "\n";

        $html = $this->curlContent($url);
        if (!is_string($html) || $html === "") {
            Log::error("unable to fetch album: " . $url);
            return;
        }
        $pageContent = $this->queryInstance->html($html);

        // body > div > div.py-2 > nav > ul > li:nth-child(6)
        $lastPageHref = (string)$pageContent->find(".py-2 > nav > ul > li:eq(-1) a")->attrs("href")->get(0);

        // Take the captured digits themselves; casting the whole capture group
        // array to int would only ever yield 0 or 1.
        $pageCount = preg_match("#=(\d+)#", $lastPageHref, $matchContent) === 1
            ? (int)$matchContent[1]
            : 0;
        if ($pageCount <= 0) {
            dump("this album is error: " . $url);
            Log::error("this album is error: " . $url);
            return;
        }

        $albumName = (string)$pageContent->find("body > div > div.pt-2 > div > div > h1")->texts()->get(0);
        if ($albumName === "") {
            // Without a name the images would land directly in $rootDir.
            Log::error("this album has no title: " . $url);
            return;
        }
        dump("albumName: " . $albumName);

        $albumDir = $this->rootDir . $albumName;
        $this->parseContent($albumDir, $pageContent);

        // Page 1 was handled above, so continue with 2 up to and including the last page.
        for ($page = 2; $page <= $pageCount; $page++) {
            $pageUrl = $url . "?page=" . $page;
            $pageHtml = $this->curlContent($pageUrl);
            if (!is_string($pageHtml) || $pageHtml === "") {
                Log::error("unable to fetch album page: " . $pageUrl);
                continue;
            }
            // parseContent() needs a parsed document, not the raw HTML string.
            $this->parseContent($albumDir, $this->queryInstance->html($pageHtml));
        }
    }

    /**
     * Download all album photos referenced by a parsed page into $dir.
     *
     * @param string $dir         Target directory; created (recursively) if missing.
     * @param mixed  $pageContent Parsed page exposing find() (the value returned by html()).
     * @return void
     */
    public function parseContent($dir, $pageContent)
    {
        if (!is_dir($dir)) {
            try {
                $created = mkdir($dir, 0777, true);
            } catch (Exception $e) {
                Log::error($e->getMessage() . "\n" . $e->getTraceAsString());
                return;
            }
            // mkdir() reports failure through its return value as well; re-check
            // is_dir() in case the directory appeared in the meantime.
            if (!$created && !is_dir($dir)) {
                Log::error("unable to create directory: " . $dir);
                return;
            }
        }

        $attrs = $pageContent->find("img.img-fluid.album-photo.d-block.mx-auto")->attrs("data-src");
        foreach ($attrs as $attr) {
            CommonService::downloadImage($dir, $attr);
        }
    }

    /**
     * Fetch a GB2312 page, convert it to UTF-8 and load it into the query instance.
     *
     * A failed read is retried up to 100 times with a random pause of
     * 1-10 seconds between attempts. If every attempt fails, an empty
     * document is loaded.
     *
     * @param string $url Page to fetch.
     * @return mixed Whatever the query instance's setHtml() returns.
     */
    public function getEncodeHtmlContent($url)
    {
        $maxAttempts = 100;
        $html = "";

        for ($attempts = 0; $attempts < $maxAttempts; $attempts++) {
            try {
                $raw = file_get_contents($url);
                if ($raw !== false) {
                    $converted = iconv('gb2312', 'UTF-8//IGNORE', $raw);
                    $html = $converted === false ? "" : $converted;
                    break;
                }
                // A false return is a failed read too; fall through to the retry pause.
                echo "failed to read {$url}\n";
            } catch (Exception $e) {
                echo $e->getTraceAsString() . "\n";
            }

            $sleepTime = 10000 * random_int(100, 1000);
            echo "sleep {$sleepTime} micro second \n";
            usleep($sleepTime);
        }

        return $this->queryInstance->setHtml($html);
    }

    /**
     * Fetch a page through Guzzle and return the response body.
     *
     * Only the path and query of $url are used; the request always goes to
     * the client's base URI (https://www.v2ph.com).
     *
     * @param string $url URL whose path and query should be requested.
     * @return string Response body.
     * @throws \InvalidArgumentException When $url cannot be parsed.
     */
    public function urlContent($url = "")
    {
        $urlParts = parse_url($url);
        if ($urlParts === false) {
            throw new \InvalidArgumentException("malformed url: " . $url);
        }

        $target = $urlParts["path"] ?? "";
        if (isset($urlParts["query"])) {
            $target .= "?" . $urlParts["query"];
        }

        $client = new Client(['base_uri' => 'https://www.v2ph.com']);
        // NOTE(review): session cookies are hard-coded here — consider moving them to configuration.
        $cookieStr = "__cfduid=d959b19dbaa25a5fac7c5fe1d3988d15e1610778634; frontend=36ab879e6d75ae4e19e4843a1bdf2e45; _gid=GA1.2.22159331.1610804846; __cf_bm=19f3172d7b8d3c97ce28f86ec56af4da3c3560a2-1610805781-1800-Adr3Ph9l2oeHE9Ms+YrTrKfE1uwMO7Tpcw/WxfRrPldspHy/AcwPovSpDcrz+DE3UHWhrw60voIaOsPy3VvpvTV9fjU9F1Hk3y1U5O6V1RZeclh6+YoIpZc1UfRyixPrKw==; frontend-rmu=qUl6oIQSib76RH6A4ZJGALhooK63; frontend-rmt=3d1i%2F0DU5cuBB%2BII5MzosWl0MPonPRchZyJyhL73PzaHWBzoq9RFDWJG%2FDPtCncI; _ga_170M3FX3HZ=GS1.1.1610804845.1.1.1610806191.0; _ga=GA1.2.6396742.1610804846; _gat_UA-140713725-1=1";
        $cookieJar = CookieJar::fromArray(parseCookieHeader($cookieStr), ".v2ph.com");

        // NOTE(review): the header map also carries a "cookie" entry; presumably
        // the jar's Cookie header wins when its cookies match — confirm.
        $response = $client->request('GET', $target, [
            'cookies' => $cookieJar,
            // Guzzle expects a header-name => value map, not "name: value" strings.
            'headers' => $this->browserHeaders(),
        ]);

        return (string)$response->getBody();
    }

    /**
     * Fetch a page with cURL using the browser-like header set.
     *
     * @param string $url Absolute URL to fetch.
     * @return string|bool Response body, or false when the transfer failed
     *                     (the cURL error is logged).
     */
    public function curlContent($url = "")
    {
        $headerLines = [];
        foreach ($this->browserHeaders() as $name => $value) {
            $headerLines[] = $name . ": " . $value;
        }

        $curl = curl_init();
        curl_setopt_array($curl, [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_ENCODING => '',
            CURLOPT_MAXREDIRS => 10,
            // NOTE(review): 0 disables the overall timeout — consider a finite limit.
            CURLOPT_TIMEOUT => 0,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
            CURLOPT_CUSTOMREQUEST => 'GET',
            CURLOPT_HTTPHEADER => $headerLines,
        ]);

        $response = curl_exec($curl);
        if ($response === false) {
            Log::error("curl request failed for " . $url . ": " . curl_error($curl));
        }
        curl_close($curl);

        return $response;
    }

    /**
     * Browser-like request headers shared by urlContent() and curlContent().
     *
     * NOTE(review): the "cookie" value is a hard-coded session — consider
     * moving it to configuration.
     *
     * @return array Map of header name to header value.
     */
    private function browserHeaders(): array
    {
        return [
            'authority' => 'www.v2ph.com',
            'pragma' => 'no-cache',
            'cache-control' => 'no-cache',
            'sec-ch-ua' => '"Google Chrome";v="87", " Not;A Brand";v="99", "Chromium";v="87"',
            'sec-ch-ua-mobile' => '?0',
            'upgrade-insecure-requests' => '1',
            'user-agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
            'accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site' => 'none',
            'sec-fetch-mode' => 'navigate',
            'sec-fetch-user' => '?1',
            'sec-fetch-dest' => 'document',
            'accept-language' => 'zh-CN,zh-TW;q=0.9,zh;q=0.8,en-US;q=0.7,en;q=0.6,ja;q=0.5',
            'cookie' => '__cfduid=d959b19dbaa25a5fac7c5fe1d3988d15e1610778634; frontend=36ab879e6d75ae4e19e4843a1bdf2e45; frontend-rmu=qUl6oIQSib76RH6A4ZJGALhooK63; frontend-rmt=3d1i%2F0DU5cuBB%2BII5MzosWl0MPonPRchZyJyhL73PzaHWBzoq9RFDWJG%2FDPtCncI; _gid=GA1.2.1263326024.1610897479; __cf_bm=71e492fc72ab21fa0b8fad836744d6d4ee44ca16-1610983571-1800-AbpNw81XY8UC9/wkttHDeMEvoRLW9Ej47TMOzfg8UySIsXqAnw5o2BvBsa7ik49xS6LTQ1VtXHfvxN+N7uo7GOuwRq8CCsdTkB10/++FWTJqVSTf6HQtvc1/ftTQZi5nuw==; _ga_170M3FX3HZ=GS1.1.1610983570.3.1.1610983709.0; _ga=GA1.2.6396742.1610804846',
        ];
    }
}