You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

374 lines
16 KiB

<?php
namespace App\Services;
use App\ImageRecord;
use Exception;
use ErrorException;
use Log;
use QL\QueryList;
class XiurenjiService
{
/**
 * Per-site configuration, keyed by the site name accepted by scrapeAlbum().
 * "path" is the URL segment appended to the domain, "dir" is the
 * sub-directory appended to $rootDir. scrapeAll() walks the map in this order.
 */
private $name_dir = [
    "xiuren"  => ["path" => "XiuRen/",  "dir" => "xiuren/"],
    "xiaoyu"  => ["path" => "XiaoYu/",  "dir" => "xiaoyu/"],
    "youwu"   => ["path" => "YouWu/",   "dir" => "youwu/"],
    "mygirl"  => ["path" => "MyGirl/",  "dir" => "mygirl/"],
    "huayang" => ["path" => "HuaYang/", "dir" => "huayang/"],
    "mfstar"  => ["path" => "MFStar/",  "dir" => "mfstar/"],
    "imiss"   => ["path" => "IMiss/",   "dir" => "imiss/"],
];

/** Site root URL (with trailing slash) that listing and album links are appended to. */
public $domainUrl = "https://www.xiurenb.net/";

/** URL of the XiuRen listing; not referenced by the methods of this class. */
public $xiurenRootUrl = "https://www.xiurenb.net/XiuRen/";

// Earlier download locations, kept for reference:
//   /Users/shixuesen/Documents/tmp/xiuren/
//   /Volumes/Backup/images/xiuren/
/** Base download directory; scrapeAlbum() temporarily appends the per-site "dir" to it. */
public $rootDir = "/Volumes/Crucial X6/Image/xr/";

/** QueryList handle obtained through QueryList::getInstance() in the constructor. */
public $queryInstance;

/** Separate QueryList object created with `new` in the constructor. */
public $queryNew;
/**
 * Prepare the two QueryList handles used by the scraper: the instance
 * returned by QueryList::getInstance() and a freshly constructed one.
 */
public function __construct()
{
    $sharedQuery = QueryList::getInstance();
    $ownQuery = new QueryList();

    $this->queryInstance = $sharedQuery;
    $this->queryNew = $ownQuery;
}
/**
 * Run scrapeAlbum() for every site configured in $name_dir, in declaration
 * order, passing 20 as the $num limit for each site.
 *
 * A single site can be scraped instead by calling scrapeAlbum() directly,
 * e.g. scrapeAlbum("xiuren", 20).
 */
public function scrapeAll()
{
    $limitPerSite = 20;

    foreach ($this->name_dir as $siteName => $siteConfig) {
        dump("current site: {$siteName}");
        $this->scrapeAlbum($siteName, $limitPerSite);
    }
}
/**
 * Scrape the listing pages of one configured site.
 *
 * Reads the count shown in ".page span strong" on the site's index page,
 * caps it with $num, converts it into a number of listing pages (20 entries
 * per page) and hands every listing page to scrapePageAlbum().
 *
 * While this method runs, $this->rootDir points at the site's own
 * sub-directory; the previous value is restored afterwards, even when a
 * nested call throws.
 *
 * @param string $path      Key of $name_dir, e.g. "xiuren".
 * @param int    $num       Upper bound applied to the count read from the index page.
 * @param int    $startPage First listing page index; 0 maps to index.html, N to index{N}.html.
 *
 * @throws \InvalidArgumentException When $path is not a configured site.
 */
public function scrapeAlbum($path, $num = 20, $startPage = 0)
{
    if (!isset($this->name_dir[$path])) {
        throw new \InvalidArgumentException("unknown site: " . $path);
    }

    $pageSize = 20;
    $site = $this->name_dir[$path];
    $urlPath = $site["path"];

    $previousRootDir = $this->rootDir;
    $this->rootDir = $previousRootDir . $site["dir"];

    try {
        // $urlPath already ends with "/", so no extra separator before index.html.
        $rawCount = $this->getEncodeHtmlContent($this->domainUrl . $urlPath . "index.html")
            ->find(".page span strong")
            ->htmls()
            ->get(0);
        dump("current site item count: " . $rawCount);

        $itemCount = (int) $rawCount;
        if ($itemCount > 0) {
            $itemCount = min($itemCount, $num);
            $lastPage = ceil($itemCount / $pageSize);

            // NOTE(review): the bound is inclusive, so starting from 0 this visits one
            // page more than ceil(count / pageSize). Kept unchanged because the site's
            // page numbering cannot be verified from this file.
            for ($i = $startPage; $i <= $lastPage; $i++) {
                $urlSuffix = $i == 0 ? "index.html" : "index" . $i . ".html";
                $this->scrapePageAlbum($this->domainUrl . $urlPath . $urlSuffix);
            }
        }
    } finally {
        // Always undo the temporary per-site directory switch.
        $this->rootDir = $previousRootDir;
    }
}
/**
 * Scrape every album linked from one listing page.
 *
 * Each ".i_list a" element's href is dumped, joined to the site root and
 * passed to scrapeSingleAlbum().
 *
 * @param string $url Absolute URL of the listing page.
 */
public function scrapePageAlbum($url)
{
    $pageContent = $this->getEncodeHtmlContent($url);
    $items = $pageContent->find(".i_list a")->getElements();

    // $domainUrl ends with "/"; strip it here and re-add exactly one separator below
    // so an href that starts with "/" does not produce "//" in the album URL.
    $base = rtrim($this->domainUrl, "/");

    foreach ($items as $item) {
        $href = (string) $item->getAttribute("href");
        dump($href);

        if (trim($href) === "") {
            // Without a target the request would hit the site root instead of an album.
            continue;
        }

        $this->scrapeSingleAlbum($base . "/" . ltrim($href, "/"));
    }
}
/**
 * Download one album: its first page plus the follow-up pages linked from
 * the album's pager.
 *
 * The album title (".item_title h1") is used both for the "already
 * downloaded" lookup and as the directory name below $this->rootDir.
 * Albums without a title are skipped, otherwise their images would be
 * written straight into the site directory.
 *
 * @param string $url Absolute URL of the album's first page.
 */
public function scrapeSingleAlbum($url)
{
    Log::info("scrapeSingleAlbum $url");
    $pageContent = $this->getEncodeHtmlContent($url);
    $albumName = $pageContent->find(".item_title h1")->htmls()->get(0);
    $pageItems = $pageContent->find(".content:eq(0) .page a")->attrs("href")->all();

    if ($albumName === null || trim((string) $albumName) === "") {
        Log::error("album name not found, skip: " . $url);
        return;
    }

    if ($this->checkAlbumHasDownload($albumName)) {
        Log::info("已经下载过了,相册名:" . $albumName);
        return;
    }
    dump("当前相册名: " . $albumName);

    // Both values are passed by reference and carried across all pages of the album.
    $imageNo = 1;
    $description = null;
    $albumDir = $this->rootDir . $albumName;

    $this->parseContent($albumDir, $pageContent, $imageNo, $description);

    // Keep pager links from index 2 up to the one before last; presumably the dropped
    // entries are "previous", the current first page and "next" - confirm against the markup.
    $pageItems = array_slice($pageItems, 2, count($pageItems) - 3);

    // Join with exactly one "/" whether or not the href starts with one.
    $base = rtrim($this->domainUrl, "/");
    foreach ($pageItems as $item) {
        $pageContent = $this->getEncodeHtmlContent($base . "/" . ltrim((string) $item, "/"));
        $this->parseContent($albumDir, $pageContent, $imageNo, $description);
    }
}
/**
 * Store every image of one album page inside $dir.
 *
 * Files are named "<user>-<description>-<imageNo>-<original basename>".
 * Files left behind by earlier naming schemes are renamed step by step
 * instead of being downloaded again (see adoptExistingImage()).
 *
 * @param string      $dir         Album directory; created (recursively) when missing.
 * @param mixed       $pageContent Parsed page as returned by getEncodeHtmlContent().
 * @param int         $imageNo     Running image number of the album; incremented once per image.
 * @param string|null $description Album description; filled from the page's
 *                                 <meta name="description"> while it is still empty.
 */
public function parseContent($dir, $pageContent, &$imageNo, &$description)
{
    if (!is_dir($dir)) {
        try {
            // Recursive, so a missing per-site parent directory is created as well.
            // The return value is checked because mkdir() only throws when an error
            // handler converts its warning into an exception.
            if (!mkdir($dir, 0777, true) && !is_dir($dir)) {
                Log::error("can not create directory " . $dir);
                return;
            }
        } catch (Exception $e) {
            Log::error($e->getMessage());
            return;
        }
    }

    $images = $pageContent->find(".content p img")->getElements();
    $user = $pageContent->find(".item_info div a:eq(-1) span")->htmls()->get(0);
    // Cast first: the lookup yields null when the element is missing.
    $userName = trim((string) $user);

    if ($description == null) {
        $metas = $pageContent->find("meta")->getElements();
        foreach ($metas as $meta) {
            $name = $meta->getAttribute("name");
            if (trim($name) == "description") {
                $description = $meta->getAttribute("content");
                break;
            }
        }
        dump("description: " . $description);
    }

    if ($imageNo == 1) {
        // Print the user name only once per album, before its first image.
        dump("user is " . $user);
    }

    $base = rtrim($this->domainUrl, "/");

    foreach ($images as $image) {
        // Short random pause (1-10 ms) between images.
        usleep(random_int(10, 100) * 100);

        $trueImageUrl = $base . $image->getAttribute("src");
        $fileInfo = pathinfo($trueImageUrl);

        if ($this->adoptExistingImage($dir, $userName, $description, $imageNo, $fileInfo)) {
            $imageNo++;
            continue;
        }

        // NOTE(review): $description is used verbatim in the file name; a "/" in it
        // would point into a sub-directory that does not exist.
        $target = $dir . DIRECTORY_SEPARATOR . $userName . "-" . $description . "-" . $imageNo . "-" . $fileInfo["basename"];

        $content = $this->downloadImage($trueImageUrl);
        if ($content !== null) {
            dump("new file is " . $target);
            if (file_put_contents($target, $content) === false) {
                Log::error("can not write image " . $target);
            }
        } else {
            Log::error("image content is empty " . $trueImageUrl);
        }
        $imageNo++;
    }
}

/**
 * Look for an already stored copy of an image under any historic naming
 * scheme and, where applicable, rename it one step towards the current scheme.
 *
 * Candidates are probed in a fixed order; the first existing file wins.
 *
 * @param string      $dir         Album directory.
 * @param string      $userName    Trimmed user name (may be empty).
 * @param string|null $description Album description (may be empty).
 * @param int         $imageNo     Number of the image inside the album.
 * @param array       $fileInfo    pathinfo() result of the image URL.
 *
 * @return bool True when a copy exists and the image must not be downloaded.
 */
private function adoptExistingImage($dir, $userName, $description, $imageNo, array $fileInfo)
{
    $prefix = $dir . DIRECTORY_SEPARATOR;
    $name = $fileInfo["filename"];
    $basename = $fileInfo["basename"];

    $numbered = $imageNo . "-" . $name;
    $withUser = $userName . "-" . $numbered;
    $complete = $userName . "-" . $description . "-" . $numbered;
    $completeBasename = $userName . "-" . $description . "-" . $imageNo . "-" . $basename;
    $userless = "-" . $description . "-" . $numbered;
    $userlessBasename = "-" . $description . "-" . $imageNo . "-" . $basename;
    $hasDescription = $description != null;

    // [existing file, new name or null when the file is only skipped]
    $candidates = [
        // Case 1: bare file name -> prepend the image number.
        [$name . ".jpg", $numbered . ".jpg"],
        [$name . ".webp", $numbered . ".webp"],
        // Case 2: user missing -> prepend the user name.
        [$numbered . ".jpg", $withUser . ".jpg"],
        [$numbered . ".webp", $withUser . ".webp"],
        // Case 3: description missing -> insert it when one is known.
        [$withUser . ".jpg", $hasDescription ? $complete . ".jpg" : null],
        [$withUser . ".webp", $hasDescription ? $complete . ".webp" : null],
        // Case 4: already stored under the current scheme.
        [$complete . ".jpg", null],
        [$completeBasename, null],
        // Case 5: stored while the user name was empty -> add the user name.
        [$userless . ".jpg", $complete . ".jpg"],
        [$userlessBasename, $completeBasename],
    ];

    foreach ($candidates as $candidate) {
        $existing = $candidate[0];
        $renameTo = $candidate[1];
        if (!file_exists($prefix . $existing)) {
            continue;
        }
        if ($renameTo !== null) {
            rename($prefix . $existing, $prefix . $renameTo);
        }
        return true;
    }

    return false;
}

/**
 * Fetch one image, retrying transport failures up to 100 times with a
 * random 1-10 second pause in between.
 *
 * @param string $url Absolute image URL.
 *
 * @return string|null The response body, or null when nothing usable was received
 *                     (empty body, HTTP status >= 400, or all attempts failed).
 */
private function downloadImage(string $url): ?string
{
    $attempts = 0;

    do {
        try {
            $curl_handle = curl_init();
            curl_setopt_array($curl_handle, [
                CURLOPT_URL => $url,
                CURLOPT_RETURNTRANSFER => true,
                CURLOPT_ENCODING => '',
                CURLOPT_MAXREDIRS => 10,
                CURLOPT_TIMEOUT => 0,
                // No overall limit (large files), but abort a transfer that stalls
                // below 1 byte/s for 60 seconds so the retry logic can take over.
                CURLOPT_LOW_SPEED_LIMIT => 1,
                CURLOPT_LOW_SPEED_TIME => 60,
                CURLOPT_FOLLOWLOCATION => true,
                CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
                CURLOPT_CUSTOMREQUEST => 'GET',
                CURLOPT_HTTPHEADER => [
                    'Connection: keep-alive',
                    'sec-ch-ua: " Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
                    'sec-ch-ua-mobile: ?0',
                    'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.109 Safari/537.36',
                    'sec-ch-ua-platform: "macOS"',
                    'Accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
                    'Sec-Fetch-Site: same-origin',
                    'Sec-Fetch-Mode: no-cors',
                    'Sec-Fetch-Dest: image',
                    'Accept-Language: zh-CN,zh;q=0.9',
                    'Cookie: UM_distinctid=17e8fc4c12917-0742b5d542c2af-133a6253-13c680-17e8fc4c12a924; ASPSESSIONIDCWCCSCAC=EODDLCJCADBNDFDGCMALGMKO; CNZZDATA1278618868=1237248404-1643081659-%7C1646134190; ASPSESSIONIDCWCDTDAD=HAJGDPOCNBIKMMMNLCENPLAM',
                ],
            ]);

            $content = curl_exec($curl_handle);
            if ($content === false) {
                $le = new Exception("get image has error: " . curl_error($curl_handle));
                curl_close($curl_handle);
                throw $le;
            }
            $status = curl_getinfo($curl_handle, CURLINFO_HTTP_CODE);
            curl_close($curl_handle);

            if ($status >= 400) {
                // An error page is not image data; do not store it and do not retry.
                Log::error("get image http status " . $status . " " . $url);
                return null;
            }

            return $content !== "" ? $content : null;
        } catch (Exception $e) {
            echo $e->getMessage() . "\n";
            echo $e->getTraceAsString() . "\n";
            // usleep() takes microseconds: this waits between 1 and 10 seconds.
            $sleepTime = 10000 * random_int(100, 1000);
            echo "wait for $url sleep {$sleepTime} microseconds \n";
            usleep($sleepTime);
            $attempts++;
        }
    } while ($attempts < 100);

    return null;
}
/**
 * Fetch a page, convert it with iconv_gbk_to_uft8() and load the result
 * into the shared QueryList instance.
 *
 * Transport failures are retried up to 100 times with a random pause of
 * 1-10 seconds. When every attempt fails the instance is loaded with an
 * empty document.
 *
 * Side effect: the unconverted response is also loaded into $this->queryNew.
 *
 * @param string $url Absolute page URL.
 *
 * @return mixed The value returned by $this->queryInstance->setHtml().
 */
public function getEncodeHtmlContent($url)
{
    $attempts = 0;
    $html = "";

    do {
        try {
            $curl = curl_init();
            curl_setopt_array($curl, [
                CURLOPT_URL => $url,
                CURLOPT_RETURNTRANSFER => true,
                // Empty string = advertise every content-coding this curl build supports
                // and decode it transparently ("UTF-8" is a charset, not a coding).
                CURLOPT_ENCODING => '',
                CURLOPT_MAXREDIRS => 10,
                CURLOPT_TIMEOUT => 0,
                // Abort a transfer that stalls below 1 byte/s for 60 seconds so the
                // retry logic below can take over instead of hanging forever.
                CURLOPT_LOW_SPEED_LIMIT => 1,
                CURLOPT_LOW_SPEED_TIME => 60,
                CURLOPT_FOLLOWLOCATION => true,
                CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
                CURLOPT_CUSTOMREQUEST => 'GET',
                // NOTE(review): TLS peer and host verification are switched off here,
                // so the response is not authenticated. Left as-is; confirm it is still needed.
                CURLOPT_SSL_VERIFYPEER => false,
                CURLOPT_SSL_VERIFYHOST => false,
                CURLOPT_HTTPHEADER => [
                    'authority: www.xiurenji.net',
                    'pragma: no-cache',
                    'cache-control: no-cache',
                    'sec-ch-ua: " Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
                    'sec-ch-ua-mobile: ?0',
                    'upgrade-insecure-requests: 1',
                    'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
                    'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                    'sec-fetch-site: same-origin',
                    'sec-fetch-mode: navigate',
                    'sec-fetch-user: ?1',
                    'sec-fetch-dest: document',
                    'referer: https://www.xiurenji.net/XiuRen/',
                    'accept-language: zh-CN,zh;q=0.9',
                    'cookie: UM_distinctid=177fd93a0ca93c-06b94658d5d337-121a4759-13c680-177fd93a0cbcaf; ASPSESSIONIDCATDQACD=FDPMPCLAMHNCPJFCBLKFLCKH; CNZZDATA1278618868=367774893-1614867004-%7C1625926983; __51cke__=; __tins__20641871=%7B%22sid%22%3A%201625931982756%2C%20%22vd%22%3A%203%2C%20%22expires%22%3A%201625933829110%7D; __51laig__=7',
                ],
            ]);

            $response = curl_exec($curl);
            $error = curl_error($curl);
            curl_close($curl);

            if ($response === false) {
                // curl_exec() reports failure by returning false, not by throwing;
                // raise it so the retry branch below actually runs.
                throw new Exception("get html has error: " . $error);
            }

            // Helper defined outside this file; by its name it converts GBK to UTF-8.
            $html = iconv_gbk_to_uft8($response);
            $this->queryNew->setHtml($response);
        } catch (Exception $e) {
            echo $e->getMessage() . "\n";
            echo $e->getTraceAsString() . "\n";
            // usleep() takes microseconds: this waits between 1 and 10 seconds.
            $sleepTime = 10000 * random_int(100, 1000);
            echo "sleep {$sleepTime} microseconds \n";
            usleep($sleepTime);
            $attempts++;
            continue;
        }
        break;
    } while ($attempts < 100);

    return $this->queryInstance->setHtml($html);
}
/**
 * Tell whether an ImageRecord row with this album name exists.
 *
 * @param string $albumName Album title as scraped from the page.
 *
 * @return bool True when a matching record is found, false otherwise.
 */
private function checkAlbumHasDownload($albumName)
{
    // Always return a real boolean instead of falling through with null.
    return ImageRecord::where("name", $albumName)->first() !== null;
}
}