You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

268 lines
14 KiB

<?php
namespace App\Services;
use GuzzleHttp\Client;
use GuzzleHttp\Cookie\CookieJar;
use GuzzleHttp\Cookie\SetCookie;
use Illuminate\Support\Facades\Log;
use QL\QueryList;
use function Zend\Diactoros\parseCookieHeader;
class CommonScrapeService
{
/**
 * Download a numbered run of JPEG images into a local directory.
 *
 * Every image is fetched from "{$sourceUrl}{$i}.jpg" for $i in [$start, $end)
 * and stored in $targetDir under the basename of its URL. A download that
 * fails is logged and skipped instead of being written out as an empty file.
 *
 * @param string $sourceUrl URL prefix to which the image number and ".jpg" are appended.
 * @param string $targetDir Directory the images are written to.
 * @param int    $start     First image number (inclusive).
 * @param int    $end       Upper bound of the image number (exclusive).
 * @return void
 */
public function scrapeBLImage(
    $sourceUrl = "http://img11.tu11.com:8080/uploads/20201125/202011251846611",
    $targetDir = "/Users/shixuesen/Pictures/bl/",
    $start = 39,
    $end = 100
) {
    $targetDir = rtrim($targetDir, "/") . "/";
    for ($i = $start; $i < $end; $i++) {
        $imageUrl = $sourceUrl . $i . ".jpg";
        try {
            $content = file_get_contents($imageUrl);
        } catch (\Throwable $e) {
            // Reached when the application's error handler converts the PHP
            // warning of a failed fetch into an exception.
            $content = false;
        }
        if ($content === false) {
            Log::warning("image could not be downloaded: " . $imageUrl);
            continue;
        }
        file_put_contents($targetDir . pathinfo($imageUrl, PATHINFO_BASENAME), $content);
    }
}
/**
 * Debug helper: fetch one hard-coded album page through Guzzle, echo the raw
 * HTML and dump the "data-src" attribute of every album photo found in it.
 *
 * NOTE(review): the request replays a captured browser session (cookies and
 * user agent are hard-coded below); the session values are credentials and
 * presumably expired - consider moving them to configuration.
 *
 * @return void
 */
public function scrapeBeautyLegImage()
{
    $client = new Client(['base_uri' => 'https://www.v2ph.com']);
    $cookieStr = "__cfduid=d959b19dbaa25a5fac7c5fe1d3988d15e1610778634; frontend=36ab879e6d75ae4e19e4843a1bdf2e45; _gid=GA1.2.22159331.1610804846; __cf_bm=19f3172d7b8d3c97ce28f86ec56af4da3c3560a2-1610805781-1800-Adr3Ph9l2oeHE9Ms+YrTrKfE1uwMO7Tpcw/WxfRrPldspHy/AcwPovSpDcrz+DE3UHWhrw60voIaOsPy3VvpvTV9fjU9F1Hk3y1U5O6V1RZeclh6+YoIpZc1UfRyixPrKw==; frontend-rmu=qUl6oIQSib76RH6A4ZJGALhooK63; frontend-rmt=3d1i%2F0DU5cuBB%2BII5MzosWl0MPonPRchZyJyhL73PzaHWBzoq9RFDWJG%2FDPtCncI; _ga_170M3FX3HZ=GS1.1.1610804845.1.1.1610806191.0; _ga=GA1.2.6396742.1610804846; _gat_UA-140713725-1=1";
    // Turn the raw "name=value; ..." header into a cookie jar scoped to the site.
    $cookieJar = CookieJar::fromArray(parseCookieHeader($cookieStr), ".v2ph.com");
    $response = $client->request('GET', '/album/am4x838z.html?page=6', [
        'cookies' => $cookieJar,
        'headers' => [
            'user-agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
            'accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'authority' => 'www.v2ph.com',
            'referer' => 'https://www.v2ph.com/company/beautyleg',
        ],
    ]);
    $html = (string)$response->getBody();
    echo $html;
    $attrs = $this->queryInstance->html($html)->find("img.img-fluid.album-photo.d-block.mx-auto")->attrs("data-src");
    dump($attrs);
}
// Site origin; presumably meant as the prefix for the relative album links found on listing pages.
public $domainUrl = "https://www.v2ph.com";
// Listing page the album crawl starts from; "?page=N" is appended for further listing pages.
public $albumRootUrl = "https://www.v2ph.com/company/beautyleg";
// public $rootDir = "/Users/shixuesen/Documents/tmp/xiuren/";
// Base directory; the album name is appended to it to form the per-album download directory.
public $rootDir = "/Volumes/Backup/images/beautyleg/";
// QueryList instance assigned in the constructor and used by the methods of this class to parse HTML.
public $queryInstance;
/**
 * Store the instance returned by QueryList::getInstance() so that the
 * scraping methods can parse fetched HTML through $this->queryInstance.
 */
public function __construct()
{
$this->queryInstance = QueryList::getInstance();
}
/**
 * Crawl every listing page below $this->albumRootUrl.
 *
 * Reads a count from the first listing page, derives the number of listing
 * pages from it and hands each page URL to scrapePageAlbum(). The first page
 * is addressed without a query string, later ones with "?page=N".
 *
 * Earlier revisions echoed the HTML and called exit right after the first
 * fetch (and again inside the loop), which made the crawl unreachable; those
 * debugging statements are removed here.
 *
 * @return void
 */
public function scrapeAlbum()
{
    // Divisor used to turn the count into a page total; presumably the
    // number of albums per listing page - verify against the site.
    $pageSize = 16;
    $html = $this->curlContent($this->albumRootUrl);
    if (!is_string($html) || $html === "") {
        Log::error("album index could not be fetched: " . $this->albumRootUrl);
        return;
    }
    $countText = $this->queryInstance->html($html)->find("body > div.container.main-wrap > div.pt-2 > div.py-2.text-center > span")->htmls()->get(0);
    // Cast once: the scraped value is a string (or null when the selector
    // matches nothing) and must not be used in arithmetic as-is.
    $albumCount = (int)trim((string)$countText);
    if ($albumCount <= 0) {
        return;
    }
    $totalPages = (int)ceil($albumCount / $pageSize);
    for ($page = 1; $page <= $totalPages; $page++) {
        $urlSuffix = $page === 1 ? "" : "?page=" . $page;
        $this->scrapePageAlbum($this->albumRootUrl . $urlSuffix);
    }
}
/**
 * Scrape every album linked from one listing page.
 *
 * Fetches $url, collects the "href" of each ".media-cover" element and passes
 * the absolute album URL ($this->domainUrl + href) to scrapeSingleAlbum().
 *
 * The previous debugging dump()/exit pair stopped the process before the
 * loop ran, and the per-album call was commented out; both are resolved here.
 *
 * @param string $url Absolute URL of a listing page.
 * @return void
 */
public function scrapePageAlbum($url)
{
    echo "scrapePageAlbum : ". $url . "\n";
    $html = $this->curlContent($url);
    if (!is_string($html) || $html === "") {
        Log::error("album listing could not be fetched: " . $url);
        return;
    }
    $items = $this->queryInstance->html($html)->find(".media-cover")->getElements();
    // Copy the links out first: scrapeSingleAlbum() loads new HTML into the
    // same shared QueryList instance, so the node set must not be relied on
    // once the first album has been processed.
    $hrefs = [];
    foreach ($items as $item) {
        $href = (string)$item->getAttribute("href");
        if ($href !== "") {
            $hrefs[] = $href;
        }
    }
    foreach ($hrefs as $href) {
        $this->scrapeSingleAlbum($this->domainUrl . $href);
    }
}
/**
 * Download every page of a single album.
 *
 * The page total is taken from the pager links (hrefs such as
 * "/album/z3x469oa.html?page=5"); the album title becomes the directory name
 * below $this->rootDir. Page 1 is parsed from the HTML already fetched for
 * $url, pages 2..N are fetched with "?page=N".
 *
 * Fixes compared to the previous revision:
 *  - a debugging exit stopped the method before any work was done;
 *  - the page count was obtained by casting an array to int (always 0 or 1);
 *  - the loop re-fetched page 1 and never fetched the last page;
 *  - the loop passed the raw HTML string to parseContent(), which needs a
 *    parsed document.
 *
 * @param string $url Absolute album URL without a "page" query parameter.
 * @return void
 */
public function scrapeSingleAlbum($url)
{
    echo "scrapeSingleAlbum : ". $url . "\n";
    $html = $this->curlContent($url);
    if (!is_string($html) || $html === "") {
        Log::error("album could not be fetched: " . $url);
        return;
    }
    $pageContent = $this->queryInstance->html($html);

    // Use the highest page number found in any pager link instead of
    // depending on the position of one particular pager entry.
    $pageCount = 0;
    foreach ($pageContent->find(".py-2 > nav > ul > li a")->attrs("href") as $href) {
        if (preg_match('#[?&]page=(\d+)#', (string)$href, $matchContent) === 1) {
            $pageCount = max($pageCount, (int)$matchContent[1]);
        }
    }
    if ($pageCount <= 0) {
        dump("this album is error: " . $url);
        Log::error("this album is error: " . $url);
        return;
    }

    $albumName = trim((string)$pageContent->find("body > div > div.pt-2 > div > div > h1")->texts()->get(0));
    if ($albumName === "") {
        Log::error("album title not found: " . $url);
        return;
    }
    // A slash in the title would otherwise be interpreted as a sub-directory.
    $albumDir = $this->rootDir . str_replace("/", "_", $albumName);

    // Page 1 is the document that is already loaded.
    $this->parseContent($albumDir, $pageContent);
    dump("albumName: " . $albumName);

    for ($i = 2; $i <= $pageCount; $i++) {
        $pageHtml = $this->curlContent($url . "?page=" . $i);
        if (!is_string($pageHtml) || $pageHtml === "") {
            Log::error("album page could not be fetched: " . $url . "?page=" . $i);
            continue;
        }
        $this->parseContent($albumDir, $this->queryInstance->html($pageHtml));
    }
}
/**
 * Download all album photos referenced by a parsed page into $dir.
 *
 * The directory is created (including missing parents) when it does not
 * exist yet. If it cannot be created the failure is logged and nothing is
 * downloaded.
 *
 * @param string $dir         Target directory for the images.
 * @param mixed  $pageContent Parsed document offering find()->attrs(), as
 *                            returned by $this->queryInstance->html().
 * @return void
 */
public function parseContent($dir, $pageContent)
{
    if (!is_dir($dir)) {
        try {
            $created = mkdir($dir, 0777, true);
        } catch (\Throwable $e) {
            // Fully qualified on purpose: an unqualified "Exception" would
            // resolve inside this namespace and never match.
            Log::error($e->getMessage() . "\n" . $e->getTraceAsString());
            return;
        }
        // mkdir() reports failure through its return value; the second
        // is_dir() tolerates the directory having appeared in the meantime.
        if (!$created && !is_dir($dir)) {
            Log::error("failed to create directory: " . $dir);
            return;
        }
    }
    $attrs = $pageContent->find("img.img-fluid.album-photo.d-block.mx-auto")->attrs("data-src");
    foreach ($attrs as $attr) {
        CommonService::downloadImage($dir, $attr);
    }
}
/**
 * Fetch a GB2312 encoded page, convert it to UTF-8 and load it into QueryList.
 *
 * The fetch is retried up to 100 times with a random pause of 1-10 seconds
 * between attempts. When every attempt fails, an empty document is loaded.
 *
 * Previously the retry branch could never run: the catch clause named an
 * unqualified "Exception" (resolved inside this namespace) and a false
 * return value from file_get_contents() was not treated as a failure.
 *
 * @param string $url Address of the page to fetch.
 * @return mixed Whatever $this->queryInstance->setHtml() returns.
 */
public function getEncodeHtmlContent($url)
{
    $maxAttempts = 100;
    $raw = false;
    for ($attempts = 0; $attempts < $maxAttempts; $attempts++) {
        try {
            $raw = file_get_contents($url);
            if ($raw === false) {
                throw new \RuntimeException("failed to fetch " . $url);
            }
            break;
        } catch (\Exception $e) {
            $raw = false;
            echo $e->getTraceAsString() . "\n";
            // usleep() takes microseconds: this is a pause of 1 to 10 seconds.
            $sleepTime = 10000 * random_int(100, 1000);
            echo "sleep {$sleepTime} microseconds \n";
            usleep($sleepTime);
        }
    }
    $html = "";
    if ($raw !== false) {
        // Conversion happens outside the retry loop: an encoding problem is
        // deterministic and would not be cured by fetching again.
        $converted = iconv('gb2312', 'UTF-8//IGNORE', $raw);
        $html = $converted === false ? "" : $converted;
    }
    return $this->queryInstance->setHtml($html);
}
/**
 * Fetch a page from www.v2ph.com through Guzzle and return its HTML.
 *
 * Only the path and query string of $url are used; the host always comes
 * from the client's base URI.
 *
 * Fixes compared to the previous revision:
 *  - headers were given as cURL style "name: value" strings in a list, but
 *    Guzzle's "headers" option is a map of header name => value, so they are
 *    now passed in that form;
 *  - the literal "cookie" header is dropped because the cookie jar passed in
 *    "cookies" already produces the Cookie header of the request;
 *  - a URL without a path no longer triggers an undefined index;
 *  - unreachable statements after the return and an unused local are removed.
 *
 * NOTE(review): the session cookies below are hard-coded credentials and
 * presumably expired - consider moving them to configuration.
 *
 * @param string $url Absolute or relative URL of the page.
 * @return string Response body.
 */
public function urlContent($url = "")
{
    $urlParts = parse_url($url);
    if (!is_array($urlParts)) {
        // parse_url() returns false for seriously malformed URLs.
        $urlParts = [];
    }
    $requestUri = isset($urlParts["path"]) ? $urlParts["path"] : "";
    if (array_key_exists("query", $urlParts)) {
        $requestUri .= "?" . $urlParts["query"];
    }

    $client = new Client(['base_uri' => 'https://www.v2ph.com']);
    $cookieStr = "__cfduid=d959b19dbaa25a5fac7c5fe1d3988d15e1610778634; frontend=36ab879e6d75ae4e19e4843a1bdf2e45; _gid=GA1.2.22159331.1610804846; __cf_bm=19f3172d7b8d3c97ce28f86ec56af4da3c3560a2-1610805781-1800-Adr3Ph9l2oeHE9Ms+YrTrKfE1uwMO7Tpcw/WxfRrPldspHy/AcwPovSpDcrz+DE3UHWhrw60voIaOsPy3VvpvTV9fjU9F1Hk3y1U5O6V1RZeclh6+YoIpZc1UfRyixPrKw==; frontend-rmu=qUl6oIQSib76RH6A4ZJGALhooK63; frontend-rmt=3d1i%2F0DU5cuBB%2BII5MzosWl0MPonPRchZyJyhL73PzaHWBzoq9RFDWJG%2FDPtCncI; _ga_170M3FX3HZ=GS1.1.1610804845.1.1.1610806191.0; _ga=GA1.2.6396742.1610804846; _gat_UA-140713725-1=1";
    $cookieJar = CookieJar::fromArray(parseCookieHeader($cookieStr), ".v2ph.com");

    $response = $client->request('GET', $requestUri, [
        'cookies' => $cookieJar,
        'headers' => [
            'authority' => 'www.v2ph.com',
            'pragma' => 'no-cache',
            'cache-control' => 'no-cache',
            'sec-ch-ua' => '"Google Chrome";v="87", " Not;A Brand";v="99", "Chromium";v="87"',
            'sec-ch-ua-mobile' => '?0',
            'upgrade-insecure-requests' => '1',
            'user-agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
            'accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site' => 'none',
            'sec-fetch-mode' => 'navigate',
            'sec-fetch-user' => '?1',
            'sec-fetch-dest' => 'document',
            'accept-language' => 'zh-CN,zh-TW;q=0.9,zh;q=0.8,en-US;q=0.7,en;q=0.6,ja;q=0.5',
        ],
    ]);

    return (string)$response->getBody();
}
/**
 * Fetch a URL with cURL while imitating a desktop Chrome browser.
 *
 * Redirects are followed (at most 10) and compressed responses are decoded.
 * A transfer failure is logged together with the cURL error message; the
 * return value in that case is still false, so callers have to check it.
 *
 * NOTE(review): CURLOPT_TIMEOUT is 0, i.e. a transfer is never aborted for
 * taking too long. The session cookie below is a hard-coded credential and
 * presumably expired - consider moving it to configuration.
 *
 * @param string $url Address to fetch.
 * @return string|false Response body, or false when the transfer failed.
 */
public function curlContent($url = "")
{
    $curl = curl_init();
    curl_setopt_array($curl, [
        CURLOPT_URL => $url,
        CURLOPT_RETURNTRANSFER => true,
        // Empty string: accept every encoding cURL supports and decode it.
        CURLOPT_ENCODING => '',
        CURLOPT_MAXREDIRS => 10,
        CURLOPT_TIMEOUT => 0,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
        CURLOPT_CUSTOMREQUEST => 'GET',
        CURLOPT_HTTPHEADER => [
            'authority: www.v2ph.com',
            'pragma: no-cache',
            'cache-control: no-cache',
            'sec-ch-ua: "Google Chrome";v="87", " Not;A Brand";v="99", "Chromium";v="87"',
            'sec-ch-ua-mobile: ?0',
            'upgrade-insecure-requests: 1',
            'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
            'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site: none',
            'sec-fetch-mode: navigate',
            'sec-fetch-user: ?1',
            'sec-fetch-dest: document',
            'accept-language: zh-CN,zh-TW;q=0.9,zh;q=0.8,en-US;q=0.7,en;q=0.6,ja;q=0.5',
            'cookie: __cfduid=d959b19dbaa25a5fac7c5fe1d3988d15e1610778634; frontend=36ab879e6d75ae4e19e4843a1bdf2e45; frontend-rmu=qUl6oIQSib76RH6A4ZJGALhooK63; frontend-rmt=3d1i%2F0DU5cuBB%2BII5MzosWl0MPonPRchZyJyhL73PzaHWBzoq9RFDWJG%2FDPtCncI; _gid=GA1.2.1263326024.1610897479; __cf_bm=71e492fc72ab21fa0b8fad836744d6d4ee44ca16-1610983571-1800-AbpNw81XY8UC9/wkttHDeMEvoRLW9Ej47TMOzfg8UySIsXqAnw5o2BvBsa7ik49xS6LTQ1VtXHfvxN+N7uo7GOuwRq8CCsdTkB10/++FWTJqVSTf6HQtvc1/ftTQZi5nuw==; _ga_170M3FX3HZ=GS1.1.1610983570.3.1.1610983709.0; _ga=GA1.2.6396742.1610804846',
        ],
    ]);
    $response = curl_exec($curl);
    if ($response === false) {
        // Read the error before the handle is closed.
        Log::error("curl request failed for " . $url . ": " . curl_error($curl));
    }
    curl_close($curl);
    return $response;
}
}