<?php
|
|
|
|
|
|
namespace App\Services;
|
|
|
|
|
|
use QL\QueryList;
|
|
|
|
class TujiguService
{
    /**
     * People whose albums are scraped, keyed by the short name that is used
     * as the prefix of every saved image file.
     *
     * Each entry has:
     *  - "dir":  sub-directory (under the image root) that holds the albums
     *  - "code": id appended to https://www.tujigu.com/t/ to reach the person page
     *  - "name": display name (not read by this class)
     *
     * Commented-out entries are disabled; uncomment one to include it in a run.
     */
    private static $name_dir = [
        // "周韦彤" => [
        //     "dir" => "周韦彤",
        //     "code" => 16274,
        //     "name" => "周韦彤"
        // ],
        // "ycc" => [
        //     "dir" => "ycc",
        //     "code" => 22162,
        //     "name" => "杨晨晨"
        // ],
        // "ry" => [
        //     "dir" => "忍野さら",
        //     "code" => "21250",
        //     "name" => "忍野さら"
        // ],
        // "azu" => [
        //     "dir" => "azu",
        //     "code" => 26002,
        //     "name" => "阿朱"
        // ],
        // "xq" => [
        //     "dir" => "xq",
        //     "code" => 22204,
        //     "name" => "小琪"
        // ],
        // "ygh" => [
        //     "dir" => "ygh",
        //     "code" => 15902,
        //     "name" => "原干惠"
        // ],
        // "wyc" => [
        //     "dir" => "wyc",
        //     "code" => 19702,
        //     "name" => "王语纯"
        // ],
        // "zz" => [
        //     "dir" => "zz",
        //     "code" => 22899,
        //     "name" => "芝芝 booty"
        // ],
        // "hlr" => [
        //     "dir" => "hlr",
        //     "code" => 20015,
        //     "name" => "黄乐然"
        // ],
        "jrq" => [
            "dir" => "jrq",
            "code" => 5034,
            "name" => "姜仁卿"
        ],
        // "ny" => [
        //     "dir" => "ny",
        //     "code" => 26298,
        //     "name" => "奈月"
        // ],
        // "杉本有美" => [
        //     "dir" => "杉本有美",
        //     "code" => 15939,
        //     "name" => "杉本有美"
        // ]
    ];

    /**
     * Download every image of every album for each enabled entry of $name_dir.
     *
     * For each person the listing page(s) are fetched, every album link is
     * visited to read its title and image count, and the images are saved as
     * "<imageRoot>/<dir>/<albumCode>-<title>/<username>-<albumCode>-<n>.jpg".
     * Images that already exist on disk are skipped, so the method can be
     * re-run to resume an interrupted scrape. Random sleeps are inserted
     * between requests, so a run can take a long time.
     *
     * @param string $imageRoot Directory under which one sub-directory per
     *                          person is kept. A trailing slash is optional.
     */
    public function scrapeTujiguGirls(string $imageRoot = "/Volumes/intel660p/image/xg/")
    {
        $imageRoot = rtrim($imageRoot, "/") . "/";
        $baseUrl = "https://www.tujigu.com/";

        foreach (self::$name_dir as $username => $name) {
            $baseDir = $imageRoot . $name["dir"] . "/";

            // Album names differ between the two source sites, so existing
            // album directories are looked up by album id instead of by name.
            // The map is rebuilt for every person so that directories of a
            // previous person are never reused by mistake.
            $albumCodeMap = $this->loadAlbumCodeMap($baseDir);

            $peopleUrl = $baseUrl . "t/" . $name["code"];

            // Read the total number of albums from the person page.
            $countHtml = QueryList::get($peopleUrl)
                ->find("body > div:nth-child(4) > span")
                ->htmls()
                ->all();
            dump($countHtml);

            $totalAlbumPage = $this->countAlbumPages($countHtml);

            // NOTE: the page counter must not share a name with any counter
            // used while downloading; a shared name previously reset this loop.
            for ($pageNo = 1; $pageNo <= $totalAlbumPage; $pageNo++) {
                if ($totalAlbumPage == 1) {
                    $albumQl = QueryList::get($peopleUrl);
                } else {
                    $albumQl = QueryList::get($peopleUrl . "/" . $pageNo . ".html");
                }

                $pageAlbum = $albumQl->find("body > div.hezi > ul > li > a")->attrs("href")->all();
                dump($pageAlbum);

                foreach ($pageAlbum as $album) {
                    if (!is_string($album) || $album === "") {
                        // Link without an href: nothing to fetch.
                        continue;
                    }

                    // Be polite to the server: wait 1-10 seconds between albums.
                    usleep(1000 * random_int(1000, 10000));
                    dump("相册:", [$album]);

                    $this->downloadAlbum($album, (string) $username, $baseDir, $baseUrl, $albumCodeMap);
                }
            }
        }
    }

    /**
     * Build a map "album id => existing directory name" for one person.
     *
     * Only sub-directories whose name starts with a numeric id followed by
     * "-" (or consisting of the id alone) are included.
     *
     * @param string $baseDir Person directory, with trailing slash.
     * @return array Empty when the directory does not exist or cannot be read.
     */
    private function loadAlbumCodeMap(string $baseDir): array
    {
        $map = [];
        if (!is_dir($baseDir)) {
            return $map;
        }

        $entries = scandir($baseDir);
        if ($entries === false) {
            return $map;
        }

        foreach ($entries as $albumDir) {
            if ($albumDir == "." || $albumDir == "..") {
                continue;
            }
            $code = explode("-", $albumDir)[0];
            if (is_dir($baseDir . $albumDir) && is_numeric($code)) {
                $map[$code] = $albumDir;
            }
        }

        return $map;
    }

    /**
     * Work out how many listing pages a person has (40 albums per page).
     *
     * The site omits the album counter when there is at most one page, so an
     * empty result means one page. A counter that contains no number is also
     * treated as one page instead of zero, so the first page is still scraped.
     *
     * @param array $countHtml HTML fragments matched by the counter selector.
     * @return int Number of pages, at least 1.
     */
    private function countAlbumPages(array $countHtml): int
    {
        $first = $countHtml[0] ?? null;
        if ($first === null || !preg_match("#\d+#", (string) $first, $match)) {
            return 1;
        }

        return max(1, (int) ceil(((int) $match[0]) / 40));
    }

    /**
     * Extract the numeric album id from an album URL such as
     * "https://www.tujigu.com/a/12345/".
     *
     * @param string $albumUrl
     * @return string|null The id, or null when the URL has no "/a/<digits>" part.
     */
    private function extractAlbumCode(string $albumUrl)
    {
        if (preg_match("#/a/(\d+)#", $albumUrl, $match)) {
            return $match[1];
        }

        return null;
    }

    /**
     * Download all images of one album into its directory.
     *
     * @param string $album        Album page URL.
     * @param string $username     Key from $name_dir, used as file-name prefix.
     * @param string $baseDir      Person directory, with trailing slash.
     * @param string $baseUrl      Site root, sent as the Referer header.
     * @param array  $albumCodeMap Map "album id => existing directory name".
     */
    private function downloadAlbum(string $album, string $username, string $baseDir, string $baseUrl, array $albumCodeMap)
    {
        $albumCode = $this->extractAlbumCode($album);
        if ($albumCode === null) {
            dump("cannot extract album id, skipped: " . $album);
            return;
        }

        $pageQL = QueryList::get($album);
        $page = $pageQL->find("body > div.tuji > p:nth-child(6)")->htmls()->all();
        $title = $pageQL->find("body > div.tuji > div.weizhi > h1")->htmls()->all();
        dump($title);

        // The first number in the info paragraph is used as the image count.
        if (!isset($title[0], $page[0]) || !preg_match("#\d+#", (string) $page[0], $result)) {
            dump("unexpected album page layout, skipped: " . $album);
            return;
        }
        $totalImageNum = (int) $result[0];

        // Path separators in a title would silently create nested directories.
        $titleStr = str_replace(["/", "\\", "\0"], "_", trim((string) $title[0]));

        if (array_key_exists($albumCode, $albumCodeMap)) {
            // Reuse the directory that already exists for this album id.
            $albumPath = $baseDir . $albumCodeMap[$albumCode];
        } else {
            $albumPath = $baseDir . $albumCode . "-" . $titleStr;
            if (!file_exists($albumPath)) {
                dump($albumPath);
                // Recursive so that a missing person directory is created too.
                if (!mkdir($albumPath, 0777, true) && !is_dir($albumPath)) {
                    dump("cannot create directory, skipped: " . $albumPath);
                    return;
                }
            }
        }

        $baseImageUrl = "https://lns.hywly.com/a/1/{$albumCode}/";

        for ($j = 1; $j <= $totalImageNum; $j++) {
            $imageUrl = $baseImageUrl . $j . ".jpg";
            $imageFile = $albumPath . "/" . $username . "-" . $albumCode . "-" . $j . ".jpg";

            if (file_exists($imageFile)) {
                dump($imageFile . " exists. skipped!");
                continue;
            }

            $body = $this->fetchImage($imageUrl, $baseUrl);
            if ($body === null) {
                // Do not create an empty file: it would be treated as
                // "already downloaded" on the next run.
                echo "download failed, skipped: {$imageUrl}\n";
                continue;
            }

            if (file_put_contents($imageFile, $body) === false) {
                echo "cannot write {$imageFile}\n";
                continue;
            }

            $sleepTime = 1000 * random_int(100, 1000);
            echo "after write image sleep {$sleepTime} microseconds \n";
            usleep($sleepTime);
            echo $imageUrl;
        }

        dump($page);
    }

    /**
     * Fetch one image with cURL, retrying on transport errors.
     *
     * A failed transfer is retried with a pause of 2-11 seconds, up to 99
     * times. Responses with an HTTP status of 400 or above are rejected so
     * that error pages are not stored as images.
     *
     * @param string $imageUrl URL of the image.
     * @param string $referer  Value of the Referer header.
     * @return string|null Response body, or null when the download failed.
     */
    private function fetchImage(string $imageUrl, string $referer)
    {
        $curl_handle = curl_init();
        curl_setopt($curl_handle, CURLOPT_URL, $imageUrl);
        // NOTE(review): CURLOPT_CONNECTTIMEOUT is in seconds, so this is a
        // 2000 second limit; milliseconds may have been intended - confirm.
        curl_setopt($curl_handle, CURLOPT_CONNECTTIMEOUT, 2000);
        curl_setopt($curl_handle, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($curl_handle, CURLOPT_USERAGENT, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36');
        curl_setopt($curl_handle, CURLOPT_REFERER, $referer);

        $query = curl_exec($curl_handle);
        $attempt = 1;
        while ($query === false) {
            echo 'Curl error: ' . curl_error($curl_handle) . "\n";
            echo "retry times: " . $attempt++ . " times \n";
            sleep(1);
            $sleepTime = 1000 * random_int(1000, 10000);
            echo "retry sleep {$sleepTime} microseconds \n";
            usleep($sleepTime);
            $query = curl_exec($curl_handle);
            if ($attempt >= 100) {
                break;
            }
        }

        if ($query === false) {
            echo curl_error($curl_handle);
            return null;
        }

        $status = (int) curl_getinfo($curl_handle, CURLINFO_HTTP_CODE);
        if ($status >= 400) {
            echo "HTTP {$status} for {$imageUrl}\n";
            return null;
        }

        return $query;
    }
}
|