<?php
|
|
|
|
|
|
namespace App\Services;
|
|
|
|
|
|
use Exception;
|
|
use QL\QueryList;
|
|
|
|
/**
 * Scrapes photo albums from tujidao01.com and stores the images on local disk.
 *
 * For every person configured in self::$name_dir the service reads the person's
 * album listing pages, derives the image URLs of each album from the album id
 * and the advertised image count, and downloads the images that are not yet
 * present locally.
 */
class TujiguService
{
    /**
     * People to scrape, keyed by a short username (the key is used as the
     * file-name prefix of every downloaded image).
     *
     * Each entry holds:
     *  - "dir":  sub-directory (below the image root) the person's albums live in
     *  - "code": id appended to the site's "/t/?id=" listing URL
     *  - "name": display name; informational only, not read by this class
     */
    private static $name_dir = [
        "周韦彤" => ["dir" => "周韦彤", "code" => 1456, "name" => "周韦彤"],
        "ycc" => ["dir" => "ycc", "code" => 459, "name" => "杨晨晨"],
        "ry" => ["dir" => "忍野さら", "code" => 1875, "name" => "忍野さら"],
        "azu" => ["dir" => "azu", "code" => 437, "name" => "阿朱"],
        "xq" => ["dir" => "xq", "code" => 2438, "name" => "小琪"],
        "ygh" => ["dir" => "ygh", "code" => 550, "name" => "原干惠"],
        "wyc" => ["dir" => "wyc", "code" => 293, "name" => "王语纯"],
        "zz" => ["dir" => "zz", "code" => 954, "name" => "芝芝 booty"],
        "hlr" => ["dir" => "hlr", "code" => 1289, "name" => "黄乐然"],
        "jrq" => ["dir" => "jrq", "code" => 5034, "name" => "姜仁卿"],
        "ny" => ["dir" => "ny", "code" => 5301, "name" => "奈月"],
        "杉本有美" => ["dir" => "杉本有美", "code" => 632, "name" => "杉本有美"],
        "糯美子" => ["dir" => "糯美子", "code" => 161, "name" => "糯美子"],
        "小雪" => ["dir" => "小雪", "code" => 388, "name" => "小雪"],
    ];

    /**
     * Download the albums of every person in self::$name_dir.
     *
     * Per person: fetch the first listing page, read the page count from the
     * pager, then walk the listing pages and download every image of every
     * album that is not already on disk. Random pauses are inserted between
     * requests. Failures (unreachable page, unparsable album entry, failed
     * download or write) are logged and skipped; they do not abort the run.
     *
     * @param bool $isAll when truthy the per-person album limit is 5000,
     *                    otherwise only the first 8 albums are processed
     * @return void
     */
    public function scrapeTujiguGirls($isAll = false)
    {
        // Extraction rules for one album tile on a listing page.
        $rules = [
            'num' => ['span.shuliang', 'text'],
            'title' => ['p.biaoti', 'text'],
            'img' => ['a>img', 'src'],
            "code" => ['p.biaoti a', "href"],
        ];
        $range = "div.hezi>ul li";

        foreach (self::$name_dir as $username => $name) {
            $baseDir = "/Volumes/intel660p/image/xg/" . $name["dir"] . "/";

            // Deliberately non-recursive: create the person's directory when it is
            // missing, but never create the image root itself (if the root is
            // absent, e.g. the volume is not mounted, we skip instead).
            if (!is_dir($baseDir) && !mkdir($baseDir) && !is_dir($baseDir)) {
                dump("cannot create " . $baseDir . ", skipped");
                continue;
            }

            // Rebuilt for every person: album ids may repeat across sites/people.
            // The map lets an album be matched by id even when the album title
            // (and therefore the directory name) differs between the two sites.
            $albumCodeMap = $this->indexExistingAlbums($baseDir);

            usleep(random_int(100, 1000) * 1000);
            $peopleUrl = "https://www.tujidao01.com/t/?id=" . $name['code'];

            echo "111111111";
            try {
                $content = $this->getContent($peopleUrl);
                if (!is_string($content) || $content === '') {
                    // getContent() reports failure by returning false, not by throwing.
                    dump("failed to fetch " . $peopleUrl . ", skipped");
                    continue;
                }
                // The last pager link carries the number of the last listing page.
                $lastPageLinks = QueryList::getInstance()
                    ->setHtml($content)
                    ->find("#pages > div > a:last-child")
                    ->attrs("href")
                    ->all();
            } catch (Exception $e) {
                dump($e->getMessage());
                continue;
            }
            echo "222222222";

            // Number of listing pages; the pager is absent when there is only one.
            $totalAlbumPage = 1;
            if (count($lastPageLinks) > 0) {
                $pageMatch = [];
                if (preg_match('#page=(\d+)#', (string) reset($lastPageLinks), $pageMatch) === 1) {
                    $totalAlbumPage = (int) $pageMatch[1];
                }
                dump($pageMatch);
            }

            $baseAlbumUrl = "https://www.tujidao01.com/t/?id={$name['code']}&page=";

            // Remaining number of albums that may still be processed for this person.
            $countLimit = $isAll ? 5000 : 8;

            echo "totalAlbumPage is $totalAlbumPage\n";

            for ($page = 1; $page <= $totalAlbumPage; $page++) {
                // Page 1 was already fetched above to read the pager.
                if ($page > 1) {
                    $content = $this->getContent($baseAlbumUrl . $page);
                    if (!is_string($content) || $content === '') {
                        dump("failed to fetch album page " . $page . ", skipped");
                        continue;
                    }
                }

                $albumList = QueryList::getInstance()
                    ->setHtml($content)
                    ->rules($rules)
                    ->range($range)
                    ->query()
                    ->getData()
                    ->all();

                foreach ($albumList as $album) {
                    if ($countLimit <= 0) {
                        dump("相册已超过限制数量,跳出");
                        break 2;
                    }
                    dump("current album page no: " . $page);
                    usleep(1000 * random_int(100, 1000));
                    dump("相册:", [$album]);

                    // Image count comes from the "<n>P" badge, album id from the link.
                    $numMatch = [];
                    $codeMatch = [];
                    if (
                        preg_match('#(\d+)P#', (string) ($album["num"] ?? ''), $numMatch) !== 1
                        || preg_match('#id\=(\d+)#', (string) ($album["code"] ?? ''), $codeMatch) !== 1
                    ) {
                        dump("cannot parse album entry, skipped");
                        continue;
                    }
                    $totalImageNum = (int) $numMatch[1];
                    $albumCode = $codeMatch[1];

                    // "/" would be interpreted as a path separator in the directory name.
                    $titleStr = str_replace("/", "-", (string) ($album["title"] ?? ''));

                    // Prefer an existing directory with the same album id, whatever its title.
                    if (array_key_exists($albumCode, $albumCodeMap)) {
                        $albumPath = $baseDir . $albumCodeMap[$albumCode];
                    } else {
                        $albumPath = $baseDir . $albumCode . "-" . $titleStr;
                        if (!file_exists($albumPath)) {
                            dump($albumPath);
                            if (!mkdir($albumPath) && !is_dir($albumPath)) {
                                dump("cannot create " . $albumPath . ", album skipped");
                                continue;
                            }
                        }
                    }

                    $baseImageUrl = "https://tjg.gzhuibei.com/a/1/{$albumCode}/";

                    for ($j = 1; $j <= $totalImageNum; $j++) {
                        $imageUrl = $baseImageUrl . $j . ".jpg";
                        $imageFile = $albumPath . "/" . $username . "-" . $albumCode . "-" . $j . ".jpg";

                        if (file_exists($imageFile)) {
                            dump($imageFile . " exists. skipped!");
                            continue;
                        }

                        $imageData = $this->downloadImage($imageUrl);
                        if ($imageData === false) {
                            // Do not leave an empty file behind: it would be treated
                            // as "already downloaded" on the next run.
                            echo "download failed, skipped: {$imageUrl}\n";
                            continue;
                        }

                        if (file_put_contents($imageFile, $imageData) === false) {
                            echo "write failed, skipped: {$imageFile}\n";
                            continue;
                        }

                        $sleepTime = 1000 * random_int(100, 1000);
                        echo "after write image sleep {$sleepTime} microseconds \n";
                        usleep($sleepTime);
                        echo $imageUrl;
                    }

                    $countLimit--;
                }
            }
        }
    }

    /**
     * Fetch a listing page of www.tujidao01.com with browser-like headers.
     *
     * @param string $url absolute URL to request
     * @return string|false response body, or false when the request failed
     */
    public function getContent($url)
    {
        $curl = curl_init();
        if ($curl === false) {
            return false;
        }

        curl_setopt_array($curl, [
            CURLOPT_URL => (string) $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_ENCODING => '',
            CURLOPT_MAXREDIRS => 10,
            CURLOPT_TIMEOUT => 10,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
            CURLOPT_CUSTOMREQUEST => 'GET',
            CURLOPT_HTTPHEADER => [
                'authority: www.tujidao01.com',
                'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                'accept-language: zh-CN,zh;q=0.9',
                'cache-control: max-age=0',
                // NOTE(review): hard-coded session cookie of a real account. It is a
                // credential and will expire; it should be moved to configuration.
                'cookie: PHPSESSID=ndm118vli42e1db7dfhqmvgjo7; __51vcke__Je64MI06Q1Neac4F=3d9a0d91-cf15-5bf7-ab90-90734f856aba; __51vuft__Je64MI06Q1Neac4F=1654567556100; uid=315696; name=nicksxs; leixing=0; __51uvsct__Je64MI06Q1Neac4F=2; __vtins__Je64MI06Q1Neac4F=%7B%22sid%22%3A%20%22d5d48e8b-a16e-5451-95f4-e629e6a4ec1b%22%2C%20%22vd%22%3A%205%2C%20%22stt%22%3A%20287951%2C%20%22dr%22%3A%205812%2C%20%22expires%22%3A%201654862307975%2C%20%22ct%22%3A%201654860507975%7D',
                'referer: https://www.tujidao01.com/sousu/?s0=%E6%9D%A8%E6%99%A8%E6%99%A8',
                'sec-ch-ua: " Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
                'sec-ch-ua-mobile: ?0',
                'sec-ch-ua-platform: "macOS"',
                'sec-fetch-dest: document',
                'sec-fetch-mode: navigate',
                'sec-fetch-site: same-origin',
                'sec-fetch-user: ?1',
                'upgrade-insecure-requests: 1',
                'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.61 Safari/537.36',
            ],
        ]);

        $response = curl_exec($curl);
        if ($response === false) {
            echo 'Curl error: ' . curl_error($curl) . "\n";
        }
        curl_close($curl);

        return $response;
    }

    /**
     * Map album id => directory name for the album directories already present
     * in $baseDir. A directory counts as an album when the part of its name
     * before the first "-" is numeric.
     *
     * @param string $baseDir directory to scan, with trailing slash
     * @return array directory names keyed by album id; empty when $baseDir
     *               does not exist or cannot be read
     */
    private function indexExistingAlbums(string $baseDir): array
    {
        $albumCodeMap = [];
        if (!is_dir($baseDir)) {
            return $albumCodeMap;
        }

        $entries = scandir($baseDir);
        if ($entries === false) {
            return $albumCodeMap;
        }

        foreach ($entries as $albumDir) {
            if ($albumDir === "." || $albumDir === "..") {
                continue;
            }
            $albumCode = explode("-", $albumDir)[0];
            if (is_numeric($albumCode) && is_dir($baseDir . $albumDir)) {
                $albumCodeMap[$albumCode] = $albumDir;
            }
        }

        return $albumCodeMap;
    }

    /**
     * Download one image, retrying on transport errors.
     *
     * The request is executed once and then retried up to 99 times (with a
     * pause of 1 s plus a random 0.1-1 s) while cURL reports a failure. A
     * response with an HTTP status of 400 or above is treated as a failure so
     * that error pages are never stored as images.
     *
     * @param string $imageUrl absolute URL of the image
     * @return string|false raw image bytes, or false when the download failed
     */
    private function downloadImage(string $imageUrl)
    {
        $curl = curl_init();
        if ($curl === false) {
            return false;
        }

        curl_setopt_array($curl, [
            CURLOPT_URL => $imageUrl,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_ENCODING => '',
            CURLOPT_MAXREDIRS => 10,
            // Finite limits so a stalled transfer fails and enters the retry
            // loop instead of blocking the whole run forever.
            CURLOPT_CONNECTTIMEOUT => 10,
            CURLOPT_TIMEOUT => 120,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
            CURLOPT_CUSTOMREQUEST => 'GET',
            CURLOPT_HTTPHEADER => [
                'Accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
                'Accept-Language: zh-CN,zh;q=0.9',
                'Cache-Control: no-cache',
                'Connection: keep-alive',
                'Pragma: no-cache',
                'Referer: https://www.tujidao01.com/',
                'Sec-Fetch-Dest: image',
                'Sec-Fetch-Mode: no-cors',
                'Sec-Fetch-Site: cross-site',
                'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.61 Safari/537.36',
                'sec-ch-ua: " Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
                'sec-ch-ua-mobile: ?0',
                'sec-ch-ua-platform: "macOS"',
            ],
        ]);

        $maxAttempts = 100;
        $body = curl_exec($curl);
        // Uses its own counter; it must not share a variable with any caller loop.
        for ($attempt = 1; $body === false && $attempt < $maxAttempts; $attempt++) {
            echo 'Curl error: ' . curl_error($curl) . "\n";
            echo "retry times: " . $attempt . " times \n";
            sleep(1);
            $sleepTime = 1000 * random_int(100, 1000);
            echo "retry sleep {$sleepTime} microseconds \n";
            usleep($sleepTime);
            $body = curl_exec($curl);
        }

        echo curl_error($curl);
        $status = (int) curl_getinfo($curl, CURLINFO_HTTP_CODE);
        curl_close($curl);

        if ($body !== false && $status >= 400) {
            echo "unexpected HTTP status {$status} for {$imageUrl}\n";
            return false;
        }

        return $body;
    }
}
|