<?php
|
|
|
|
|
|
namespace App\Services;
|
|
|
|
|
|
use App\ImageRecord;
|
|
use Exception;
|
|
use ErrorException;
|
|
use Log;
|
|
use QL\QueryList;
|
|
|
|
/**
 * Scrapes album listing pages below $xiurenRootUrl and stores the images of
 * every album in a directory named after the album below $rootDir.
 *
 * Pages are fetched with cURL, converted by the external helper
 * iconv_gbk_to_uft8() and parsed with QueryList.
 */
class XiaoyuService
{
    /** Maximum number of attempts for one HTTP request before giving up. */
    private const MAX_ATTEMPTS = 100;

    /**
     * Host that is prepended to every image `src` found on an album page.
     * NOTE(review): this host differs from the one in $domainUrl — confirm
     * that images are really meant to be fetched from here.
     */
    private const IMAGE_HOST = 'https://www.xiurenji.net';

    /** Site root (with trailing slash); album links are resolved against it. */
    public $domainUrl = "https://www.xiurenb.net/";

    /** Root of the album listing; "index.html" / "index<N>.html" are appended. */
    public $xiurenRootUrl = "https://www.xiurenb.net/XiaoYu/";

    /** Directory (with trailing slash) that receives one sub-directory per album. */
    public $rootDir = "/Volumes/Crucial X6/Image/xiaoyu/";

    /** Shared QueryList instance; getEncodeHtmlContent() loads each page into it. */
    public $queryInstance;

    /** Second QueryList instance; receives the raw, unconverted response body. */
    public $queryNew;

    public function __construct()
    {
        $this->queryInstance = QueryList::getInstance();
        $this->queryNew = new QueryList();
    }

    /**
     * Entry point: walks the album listing pages and scrapes every album on them.
     *
     * The value read from ".page span strong" is only used to decide whether
     * the listing is reachable; nothing happens when it is not a positive number.
     */
    public function scrapeAlbum()
    {
        $pageSize = 20;
        $pageCount = $this->getEncodeHtmlContent($this->xiurenRootUrl . 'index.html')
            ->find(".page span strong")
            ->htmls()
            ->get(0);
        print_r($pageCount);

        if ((int) $pageCount <= 0) {
            return;
        }

        // NOTE(review): the scraped total is deliberately replaced by a fixed
        // 20, so the loop below visits exactly index.html and index1.html.
        // Confirm whether the real total should be used instead.
        $pageCount = 20;
        $lastPage = (int) ceil($pageCount / $pageSize);

        for ($page = 0; $page <= $lastPage; $page++) {
            $urlSuffix = $page === 0 ? "index.html" : "index" . $page . ".html";
            $this->scrapePageAlbum($this->xiurenRootUrl . $urlSuffix);
        }
    }

    /**
     * Scrapes every album linked from one listing page.
     *
     * @param string $url absolute URL of the listing page
     */
    public function scrapePageAlbum($url)
    {
        $items = $this->getEncodeHtmlContent($url)->find(".i_list a")->getElements();

        foreach ($items as $item) {
            $href = $item->getAttribute("href");
            dump($href);

            // An anchor without href would otherwise resolve to the site root.
            if ($href === '') {
                continue;
            }

            $this->scrapeSingleAlbum($this->joinUrl($this->domainUrl, $href));
        }
    }

    /**
     * Downloads all images of one album unless an ImageRecord with the album
     * name already exists.
     *
     * @param string $url absolute URL of the album's first page
     */
    public function scrapeSingleAlbum($url)
    {
        Log::info("scrapeSingleAlbum $url");

        $pageContent = $this->getEncodeHtmlContent($url);
        $albumName = $pageContent->find(".item_title h1")->htmls()->get(0);
        $pageItems = $pageContent->find(".content:eq(0) .page a")->attrs("href")->all();

        // Without a name the target directory would be $rootDir itself.
        if (trim((string) $albumName) === '') {
            Log::error("album name not found, skip " . $url);
            return;
        }

        if ($this->checkAlbumHasDownload($albumName)) {
            Log::info("已经下载过了,相册名:" . $albumName);
            return;
        }

        $albumDir = $this->rootDir . $albumName;
        $imageNo = 1;
        $this->parseContent($albumDir, $pageContent, $imageNo);

        // Drops the first two pager links and the last one; presumably these
        // are "previous", the page already parsed above, and "next" — TODO
        // confirm against the site's pager markup.
        $pageItems = array_slice($pageItems, 2, count($pageItems) - 3);

        foreach ($pageItems as $item) {
            $pageContent = $this->getEncodeHtmlContent($this->joinUrl($this->domainUrl, $item));
            $this->parseContent($albumDir, $pageContent, $imageNo);
        }
    }

    /**
     * Stores every image of one album page in $dir.
     *
     * Files are named "<user>-<imageNo>-<basename>". Files that already exist
     * under an older naming scheme ("<basename>" or "<imageNo>-<basename>")
     * are renamed one step towards the current scheme instead of being
     * downloaded again.
     *
     * @param string $dir         target directory; created when missing (parent must exist)
     * @param mixed  $pageContent QueryList object holding the album page
     * @param int    $imageNo     running image counter, incremented once per image (by reference)
     */
    public function parseContent($dir, $pageContent, &$imageNo)
    {
        if (!$this->ensureDirectory($dir)) {
            return;
        }

        $images = $pageContent->find(".content p img")->getElements();
        $rawUser = $pageContent->find(".item_info div a:eq(-1) span")->htmls()->get(0);
        dump("user is " . $rawUser);
        // The selector may match nothing, in which case get(0) yields null.
        $user = trim((string) $rawUser);

        foreach ($images as $image) {
            usleep(random_int(10, 100) * 100);

            $trueImageUrl = self::IMAGE_HOST . $image->getAttribute("src");
            $fileInfo = pathinfo($trueImageUrl);
            $baseName = $fileInfo["basename"];

            $plainPath = $dir . DIRECTORY_SEPARATOR . $baseName;
            $numberedPath = $dir . DIRECTORY_SEPARATOR . $imageNo . "-" . $baseName;
            $finalPath = $dir . DIRECTORY_SEPARATOR . $user . "-" . $imageNo . "-" . $baseName;

            if (file_exists($plainPath)) {
                rename($plainPath, $numberedPath);
                $imageNo++;
                continue;
            }

            if (file_exists($numberedPath)) {
                rename($numberedPath, $finalPath);
                $imageNo++;
                continue;
            }

            if (file_exists($finalPath)) {
                $imageNo++;
                continue;
            }

            dump($fileInfo);

            $content = $this->downloadImage($trueImageUrl);
            if ($content === null) {
                Log::error("image content is empty " . $trueImageUrl);
            } elseif (file_put_contents($finalPath, $content) === false) {
                Log::error("failed to write image " . $finalPath);
            }

            $imageNo++;
        }
    }

    /**
     * Fetches a page, converts it with iconv_gbk_to_uft8() and loads the
     * result into the shared QueryList instance.
     *
     * A failed request is retried up to MAX_ATTEMPTS times with a random
     * 1-10 second pause; when every attempt fails an empty document is loaded.
     *
     * @param string $url absolute page URL
     * @return mixed whatever QueryList::setHtml() returns; callers chain find() on it
     */
    public function getEncodeHtmlContent($url)
    {
        $html = "";

        for ($attempt = 0; $attempt < self::MAX_ATTEMPTS; $attempt++) {
            try {
                list($response) = $this->curlGet($url, $this->pageRequestOptions());
                $html = iconv_gbk_to_uft8($response);
                // Kept for compatibility: $queryNew is public and always
                // received the raw response.
                $this->queryNew->setHtml($response);
            } catch (Exception $e) {
                echo $e->getMessage() . "\n";
                echo $e->getTraceAsString() . "\n";
                $this->sleepBeforeRetry("");
                continue;
            }

            break;
        }

        dump("current url: " . $url);

        return $this->queryInstance->setHtml($html);
    }

    /**
     * Tells whether an ImageRecord named $albumName exists.
     *
     * @param string $albumName album title as scraped from the page
     */
    private function checkAlbumHasDownload($albumName): bool
    {
        return ImageRecord::where("name", $albumName)->first() !== null;
    }

    /**
     * Joins a base URL and a link with exactly one slash between them;
     * links that are already absolute http(s) URLs are returned unchanged.
     */
    private function joinUrl($base, $path)
    {
        $path = (string) $path;
        if (preg_match('#^https?://#i', $path) === 1) {
            return $path;
        }

        return rtrim($base, '/') . '/' . ltrim($path, '/');
    }

    /**
     * Creates $dir when it does not exist yet (non-recursive, so a missing
     * parent is reported instead of being created silently).
     *
     * @return bool true when the directory exists afterwards
     */
    private function ensureDirectory($dir)
    {
        if (is_dir($dir)) {
            return true;
        }

        try {
            // mkdir() reports failure via its return value unless an error
            // handler turns the warning into an exception; handle both.
            if (mkdir($dir) || is_dir($dir)) {
                return true;
            }
            Log::error("failed to create directory " . $dir);
        } catch (Exception $e) {
            Log::error($e->getMessage());
        }

        return false;
    }

    /**
     * Downloads one image, retrying transport failures up to MAX_ATTEMPTS times.
     *
     * @return string|null image bytes, or null when the body is empty, the
     *                     server answered with an HTTP error status, or every
     *                     attempt failed
     */
    private function downloadImage($imageUrl)
    {
        for ($attempt = 0; $attempt < self::MAX_ATTEMPTS; $attempt++) {
            try {
                list($body, $status) = $this->curlGet($imageUrl, $this->imageRequestOptions());
            } catch (Exception $e) {
                echo $e->getMessage() . "\n";
                echo $e->getTraceAsString() . "\n";
                $this->sleepBeforeRetry("wait for $imageUrl ");
                continue;
            }

            // An error page must never be stored as if it were the image.
            if ($status >= 400) {
                Log::error("image request returned HTTP " . $status . " " . $imageUrl);
                return null;
            }

            return $body === '' ? null : $body;
        }

        return null;
    }

    /**
     * Performs one GET request.
     *
     * @param string $url     request URL
     * @param array  $options additional cURL options
     * @return array [string $body, int $httpStatus]
     * @throws Exception when the handle cannot be created or the transfer fails
     */
    private function curlGet($url, array $options)
    {
        $handle = curl_init();
        if ($handle === false) {
            throw new Exception("curl_init failed for " . $url);
        }

        try {
            curl_setopt_array($handle, [
                CURLOPT_URL => $url,
                CURLOPT_RETURNTRANSFER => true,
            ] + $options);

            $body = curl_exec($handle);
            if ($body === false) {
                throw new Exception("request failed for " . $url . ": " . curl_error($handle));
            }

            return [$body, (int) curl_getinfo($handle, CURLINFO_HTTP_CODE)];
        } finally {
            curl_close($handle);
        }
    }

    /** Pauses for a random 1-10 seconds and prints the chosen delay. */
    private function sleepBeforeRetry($prefix)
    {
        $sleepTime = 10000 * random_int(100, 1000);
        echo "{$prefix}sleep {$sleepTime} microseconds \n";
        usleep($sleepTime);
    }

    /** cURL options used for image downloads. */
    private function imageRequestOptions()
    {
        return [
            // Seconds; the previous value of 20000 amounted to several hours.
            CURLOPT_CONNECTTIMEOUT => 20,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
            CURLOPT_REFERER => $this->xiurenRootUrl,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_ENCODING => '',
            CURLOPT_MAXREDIRS => 10,
            // NOTE(review): 0 means no overall transfer timeout.
            CURLOPT_TIMEOUT => 0,
            CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
            CURLOPT_CUSTOMREQUEST => 'GET',
            CURLOPT_HTTPHEADER => [
                'authority: www.xiurenji.net',
                'pragma: no-cache',
                'cache-control: no-cache',
                'sec-ch-ua: "Google Chrome";v="95", "Chromium";v="95", ";Not A Brand";v="99"',
                'sec-ch-ua-mobile: ?0',
                'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
                'sec-ch-ua-platform: "macOS"',
                'accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
                'sec-fetch-site: same-origin',
                'sec-fetch-mode: no-cors',
                'sec-fetch-dest: image',
                'referer: https://www.xiurenji.net/XiuRen/9483.html',
                'accept-language: zh-CN,zh;q=0.9',
                'cookie: UM_distinctid=17cfa8bea8eb9e-0dd0c6d032d0fc-1c306851-13c680-17cfa8bea8fc85; CNZZDATA1278618868=1505121253-1636283360-%7C1636283360; __51cke__=; ASPSESSIONIDQAQAATSQ=LBLGNPMDHKKMNOPDBCEAPIMH; __tins__20641871=%7B%22sid%22%3A%201636291046220%2C%20%22vd%22%3A%202%2C%20%22expires%22%3A%201636292852634%7D; __51laig__=2',
            ],
        ];
    }

    /** cURL options used for HTML page requests. */
    private function pageRequestOptions()
    {
        return [
            // CURLOPT_ENCODING controls Accept-Encoding (compression), not the
            // character set; '' advertises every encoding cURL can decode.
            CURLOPT_ENCODING => '',
            CURLOPT_MAXREDIRS => 10,
            // NOTE(review): 0 means no overall transfer timeout.
            CURLOPT_TIMEOUT => 0,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
            CURLOPT_CUSTOMREQUEST => 'GET',
            // NOTE(security): TLS peer and host verification are disabled for
            // page requests; re-enable them if the site's certificate allows it.
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => false,
            CURLOPT_HTTPHEADER => [
                'authority: www.xiurenji.net',
                'pragma: no-cache',
                'cache-control: no-cache',
                'sec-ch-ua: " Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
                'sec-ch-ua-mobile: ?0',
                'upgrade-insecure-requests: 1',
                'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
                'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                'sec-fetch-site: same-origin',
                'sec-fetch-mode: navigate',
                'sec-fetch-user: ?1',
                'sec-fetch-dest: document',
                'referer: https://www.xiurenji.net/XiuRen/',
                'accept-language: zh-CN,zh;q=0.9',
                'cookie: UM_distinctid=177fd93a0ca93c-06b94658d5d337-121a4759-13c680-177fd93a0cbcaf; ASPSESSIONIDCATDQACD=FDPMPCLAMHNCPJFCBLKFLCKH; CNZZDATA1278618868=367774893-1614867004-%7C1625926983; __51cke__=; __tins__20641871=%7B%22sid%22%3A%201625931982756%2C%20%22vd%22%3A%203%2C%20%22expires%22%3A%201625933829110%7D; __51laig__=7',
            ],
        ];
    }
}
|