|
|
<?php
|
|
|
|
|
|
namespace App\Services;
|
|
|
|
|
|
use GuzzleHttp\Client;
|
|
|
use GuzzleHttp\Cookie\CookieJar;
|
|
|
use GuzzleHttp\Cookie\SetCookie;
|
|
|
use Illuminate\Support\Facades\Log;
|
|
|
use QL\QueryList;
|
|
|
use function Zend\Diactoros\parseCookieHeader;
|
|
|
|
|
|
/**
 * Ad-hoc scraper for image albums hosted on www.v2ph.com (plus one fixed
 * image range on img11.tu11.com).
 *
 * Pages are fetched either through Guzzle (urlContent, scrapeBeautyLegImage)
 * or through raw cURL (curlContent) and are parsed with QueryList.
 *
 * NOTE(review): the session cookies below are hard-coded browser captures.
 * They should be moved to configuration and are almost certainly expired.
 */
class CommonScrapeService
{
    /** @var string Scheme and host prepended to relative album links. */
    public $domainUrl = "https://www.v2ph.com";

    /** @var string Listing page whose pagination is walked by scrapeAlbum(). */
    public $albumRootUrl = "https://www.v2ph.com/company/beautyleg";

    /** @var string Directory (with trailing slash) under which one folder per album is created. */
    public $rootDir = "/Volumes/Backup/images/beautyleg/";

    /** @var \QL\QueryList Shared QueryList instance used for all HTML parsing. */
    public $queryInstance;

    /** @var string Cookie header value used to seed the Guzzle cookie jar. */
    private static $jarCookieHeader = '__cfduid=d959b19dbaa25a5fac7c5fe1d3988d15e1610778634; frontend=36ab879e6d75ae4e19e4843a1bdf2e45; _gid=GA1.2.22159331.1610804846; __cf_bm=19f3172d7b8d3c97ce28f86ec56af4da3c3560a2-1610805781-1800-Adr3Ph9l2oeHE9Ms+YrTrKfE1uwMO7Tpcw/WxfRrPldspHy/AcwPovSpDcrz+DE3UHWhrw60voIaOsPy3VvpvTV9fjU9F1Hk3y1U5O6V1RZeclh6+YoIpZc1UfRyixPrKw==; frontend-rmu=qUl6oIQSib76RH6A4ZJGALhooK63; frontend-rmt=3d1i%2F0DU5cuBB%2BII5MzosWl0MPonPRchZyJyhL73PzaHWBzoq9RFDWJG%2FDPtCncI; _ga_170M3FX3HZ=GS1.1.1610804845.1.1.1610806191.0; _ga=GA1.2.6396742.1610804846; _gat_UA-140713725-1=1';

    /** @var string Cookie header value sent verbatim by curlContent(). */
    private static $curlCookieHeader = '__cfduid=d959b19dbaa25a5fac7c5fe1d3988d15e1610778634; frontend=36ab879e6d75ae4e19e4843a1bdf2e45; frontend-rmu=qUl6oIQSib76RH6A4ZJGALhooK63; frontend-rmt=3d1i%2F0DU5cuBB%2BII5MzosWl0MPonPRchZyJyhL73PzaHWBzoq9RFDWJG%2FDPtCncI; _gid=GA1.2.1263326024.1610897479; __cf_bm=71e492fc72ab21fa0b8fad836744d6d4ee44ca16-1610983571-1800-AbpNw81XY8UC9/wkttHDeMEvoRLW9Ej47TMOzfg8UySIsXqAnw5o2BvBsa7ik49xS6LTQ1VtXHfvxN+N7uo7GOuwRq8CCsdTkB10/++FWTJqVSTf6HQtvc1/ftTQZi5nuw==; _ga_170M3FX3HZ=GS1.1.1610983570.3.1.1610983709.0; _ga=GA1.2.6396742.1610804846';

    public function __construct()
    {
        $this->queryInstance = QueryList::getInstance();
    }

    /**
     * Downloads the fixed image range 39..99 from img11.tu11.com into a
     * local folder, naming each file after the last URL segment.
     *
     * An image whose download returns false is logged and skipped instead
     * of being written to disk as an empty file.
     *
     * @return void
     */
    public function scrapeBLImage()
    {
        $sourceUrl = "http://img11.tu11.com:8080/uploads/20201125/202011251846611";
        $targetDir = "/Users/shixuesen/Pictures/bl/";

        for ($i = 39; $i < 100; $i++) {
            $imageUrl = $sourceUrl . $i . ".jpg";
            $content = file_get_contents($imageUrl);
            if ($content === false) {
                Log::error("failed to download image: " . $imageUrl);
                continue;
            }
            file_put_contents($targetDir . pathinfo($imageUrl, PATHINFO_BASENAME), $content);
        }
    }

    /**
     * Experiment: fetches one fixed album page through Guzzle, prints the
     * raw HTML and dumps the `data-src` attributes of the album images.
     *
     * @return void
     */
    public function scrapeBeautyLegImage()
    {
        $response = $this->newClient()->request('GET', '/album/am4x838z.html?page=6', [
            'cookies' => $this->newCookieJar(),
            'headers' => [
                'user-agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
                'accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                'authority' => 'www.v2ph.com',
                'referer' => 'https://www.v2ph.com/company/beautyleg',
            ],
        ]);

        $html = (string)$response->getBody();
        echo $html;

        $attrs = $this->queryInstance->html($html)->find("img.img-fluid.album-photo.d-block.mx-auto")->attrs("data-src");
        dump($attrs);
    }

    /**
     * Walks every listing page under $albumRootUrl and hands each page URL
     * to scrapePageAlbum().
     *
     * The number of listing pages is derived from the counter shown on the
     * first page divided by 16 (presumably the total album count and the
     * number of albums per page - TODO confirm against the live markup).
     *
     * The debugging `exit` calls that used to stop the whole PHP process
     * after the first request have been removed.
     *
     * @return void
     */
    public function scrapeAlbum()
    {
        $pageSize = 16;

        $html = $this->curlContent($this->albumRootUrl);
        if (!$this->isUsableHtml($html)) {
            return;
        }

        $counter = $this->queryInstance->html($html)->find("body > div.container.main-wrap > div.pt-2 > div.py-2.text-center > span")->htmls()->get(0);
        print_r($counter);

        // Cast once so the arithmetic below never operates on a raw string.
        $total = (int)$counter;
        if ($total <= 0) {
            return;
        }

        $lastPage = (int)ceil($total / $pageSize);
        for ($page = 1; $page <= $lastPage; $page++) {
            // The first listing page has no query string.
            $urlSuffix = $page === 1 ? "" : "?page=" . $page;
            $this->scrapePageAlbum($this->albumRootUrl . $urlSuffix);
        }
    }

    /**
     * Fetches one listing page and dumps the href of every `.media-cover`
     * element found on it.
     *
     * NOTE(review): descending into each album via scrapeSingleAlbum() was
     * commented out in the original code and is deliberately left disabled
     * here; enable it once the download path has been confirmed.
     *
     * @param string $url Absolute URL of the listing page.
     * @return void
     */
    public function scrapePageAlbum($url)
    {
        echo "scrapePageAlbum : ". $url . "\n";

        $html = $this->curlContent($url);
        if (!$this->isUsableHtml($html)) {
            return;
        }

        $items = $this->queryInstance->html($html)->find(".media-cover")->getElements();
        foreach ($items as $item) {
            dump($item->getAttribute("href"));
        }
    }

    /**
     * Downloads every image of one album, following its pagination.
     *
     * The page count is read from the number in the href of the last
     * pagination link (e.g. `/album/z3x469oa.html?page=5`). The first page
     * is parsed from the initial response; pages 2..N are fetched with
     * `?page=<n>` appended to $url.
     *
     * @param string $url Absolute album URL without a query string.
     * @return void
     */
    public function scrapeSingleAlbum($url)
    {
        echo "scrapeSingleAlbum : ". $url . "\n";

        $html = $this->curlContent($url);
        if (!$this->isUsableHtml($html)) {
            Log::error("this album is error: " . $url);
            return;
        }

        $pageContent = $this->queryInstance->html($html);

        // NOTE(review): element access is kept exactly as the original wrote
        // it; verify that index access on getElements() yields an object
        // exposing attr().
        $pageItems = $pageContent->find(".py-2 > nav > ul > li:eq(-1) a")->getElements();
        $lastHref = $pageItems[0]->attr("href");

        // Take the captured digits themselves; casting the whole capture
        // array to int would only ever produce 0 or 1.
        $pageCount = preg_match('#=(\d+)#', $lastHref, $matchContent) === 1
            ? (int)$matchContent[1]
            : 0;

        if ($pageCount <= 0) {
            dump("this album is error: " . $url);
            Log::error("this album is error: " . $url);
            return;
        }

        $albumName = $this->safeDirName($pageContent->find("body > div > div.pt-2 > div > div > h1")->texts()[0]);
        if ($albumName === '') {
            Log::error("this album is error: " . $url);
            return;
        }

        $albumDir = $this->rootDir . $albumName;
        $this->parseContent($albumDir, $pageContent);
        dump("albumName: " . $albumName);

        // Page 1 was handled above, so continue with 2 up to and including the last page.
        for ($page = 2; $page <= $pageCount; $page++) {
            $pageHtml = $this->curlContent($url . "?page=" . $page);
            if (!$this->isUsableHtml($pageHtml)) {
                continue;
            }
            // parseContent() needs a parsed document, not the raw HTML string.
            $this->parseContent($albumDir, $this->queryInstance->html($pageHtml));
        }
    }

    /**
     * Ensures $dir exists and downloads every album image referenced on the
     * given parsed page into it via CommonService::downloadImage().
     *
     * @param string       $dir         Target directory for the images.
     * @param \QL\QueryList $pageContent Parsed album page.
     * @return void
     */
    public function parseContent($dir, $pageContent)
    {
        if (!is_dir($dir)) {
            try {
                // Recursive so that a missing parent directory does not abort the album.
                if (!mkdir($dir, 0777, true) && !is_dir($dir)) {
                    Log::error("unable to create directory: " . $dir);
                    return;
                }
            } catch (\Exception $e) {
                // Fully qualified: a bare `Exception` inside this namespace
                // resolves to App\Services\Exception and never matches.
                Log::error($e->getTraceAsString());
                return;
            }
        }

        $attrs = $pageContent->find("img.img-fluid.album-photo.d-block.mx-auto")->attrs("data-src");
        foreach ($attrs as $attr) {
            CommonService::downloadImage($dir, $attr);
        }
    }

    /**
     * Fetches a GB2312-encoded page, converts it to UTF-8 and loads it into
     * the shared QueryList instance.
     *
     * A failed fetch (exception or false return) is retried up to 100 times
     * with a random pause of 1-10 seconds between attempts. If every attempt
     * fails, an empty document is loaded.
     *
     * @param string $url Page URL.
     * @return \QL\QueryList The shared instance with the page loaded.
     */
    public function getEncodeHtmlContent($url)
    {
        $maxAttempts = 100;
        $html = "";

        for ($attempt = 0; $attempt < $maxAttempts; $attempt++) {
            try {
                $raw = file_get_contents($url);
                if ($raw !== false) {
                    $converted = iconv('gb2312', 'UTF-8//IGNORE', $raw);
                    $html = $converted === false ? "" : $converted;
                    break;
                }
                echo "failed to fetch {$url}\n";
            } catch (\Exception $e) {
                echo $e->getTraceAsString() . "\n";
            }

            // usleep() takes microseconds: 1,000,000 .. 10,000,000 => 1-10 s.
            $sleepTime = 10000 * random_int(100, 1000);
            echo "sleep {$sleepTime} microseconds\n";
            usleep($sleepTime);
        }

        return $this->queryInstance->setHtml($html);
    }

    /**
     * Fetches a page through Guzzle and returns its body.
     *
     * Only the path and query string of $url are used; the request is always
     * sent to the https://www.v2ph.com base URI. Cookies come from the
     * hard-coded cookie jar.
     *
     * @param string $url Absolute or relative URL.
     * @return string Response body.
     */
    public function urlContent($url = "")
    {
        $urlParts = parse_url($url);
        if (!is_array($urlParts)) {
            // parse_url() returns false for seriously malformed URLs.
            $urlParts = [];
        }

        $target = isset($urlParts["path"]) ? $urlParts["path"] : "";
        if (isset($urlParts["query"])) {
            $target .= "?" . $urlParts["query"];
        }

        $response = $this->newClient()->request('GET', $target, [
            'cookies' => $this->newCookieJar(),
            // Guzzle expects name => value pairs; a plain list of
            // "name: value" strings is sent as headers named "0", "1", ...
            'headers' => $this->browserHeaders(),
        ]);

        return (string)$response->getBody();
    }

    /**
     * Fetches a page through raw cURL with browser-like headers.
     *
     * @param string $url Absolute URL to request.
     * @return string|false Response body, or false when the transfer failed
     *                      (the cURL error is logged).
     */
    public function curlContent($url = "")
    {
        $headerLines = [];
        foreach ($this->browserHeaders() as $name => $value) {
            $headerLines[] = $name . ': ' . $value;
        }
        $headerLines[] = 'cookie: ' . self::$curlCookieHeader;

        $curl = curl_init();
        curl_setopt_array($curl, [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_ENCODING => '',
            CURLOPT_MAXREDIRS => 10,
            // Bounded so a stalled connection cannot hang the scraper forever.
            CURLOPT_CONNECTTIMEOUT => 15,
            CURLOPT_TIMEOUT => 60,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
            CURLOPT_CUSTOMREQUEST => 'GET',
            CURLOPT_HTTPHEADER => $headerLines,
        ]);

        $response = curl_exec($curl);
        if ($response === false) {
            Log::error("curl request failed for " . $url . ": " . curl_error($curl));
        }
        curl_close($curl);

        return $response;
    }

    /**
     * Browser-like request headers shared by urlContent() and curlContent().
     *
     * @return array<string, string> Header name => value, in send order.
     */
    private function browserHeaders(): array
    {
        return [
            'authority' => 'www.v2ph.com',
            'pragma' => 'no-cache',
            'cache-control' => 'no-cache',
            'sec-ch-ua' => '"Google Chrome";v="87", " Not;A Brand";v="99", "Chromium";v="87"',
            'sec-ch-ua-mobile' => '?0',
            'upgrade-insecure-requests' => '1',
            'user-agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
            'accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site' => 'none',
            'sec-fetch-mode' => 'navigate',
            'sec-fetch-user' => '?1',
            'sec-fetch-dest' => 'document',
            'accept-language' => 'zh-CN,zh-TW;q=0.9,zh;q=0.8,en-US;q=0.7,en;q=0.6,ja;q=0.5',
        ];
    }

    /**
     * Creates a Guzzle client bound to the v2ph base URI.
     */
    private function newClient(): Client
    {
        return new Client(['base_uri' => 'https://www.v2ph.com']);
    }

    /**
     * Builds a cookie jar for the .v2ph.com domain from the stored cookie header.
     */
    private function newCookieJar(): CookieJar
    {
        return CookieJar::fromArray(parseCookieHeader(self::$jarCookieHeader), ".v2ph.com");
    }

    /**
     * Tells whether a fetch result is a non-empty string that can be parsed.
     *
     * @param mixed $html Return value of curlContent().
     */
    private function isUsableHtml($html): bool
    {
        return is_string($html) && $html !== '';
    }

    /**
     * Turns a scraped title into a single safe path segment: trims it and
     * replaces directory separators and NUL bytes, so remote content cannot
     * point the download directory outside $rootDir.
     *
     * @param mixed $name Scraped title (may be null when the element is missing).
     */
    private function safeDirName($name): string
    {
        $clean = str_replace(['/', '\\', "\0"], '_', trim((string)$name));

        return ($clean === '.' || $clean === '..') ? '' : $clean;
    }
}
|