You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

270 lines
11 KiB

<?php
namespace App\Services;
use GuzzleHttp\Exception\ConnectException;
use Illuminate\Support\Arr;
use QL\QueryList;
use Illuminate\Http\File;
use Illuminate\Support\Facades\Storage;
/**
 * Scrapes photo albums from nvshens.net for a fixed list of people and stores
 * the images on local disk, one directory per album.
 *
 * Images that already exist on disk are skipped, so the scrape can be re-run
 * to pick up where a previous run stopped.
 */
class NewNvshenService
{
    /**
     * People to scrape, keyed by the short handle used as the image file-name prefix.
     *
     * Each entry holds:
     *  - "dir":  sub-directory (below the image root) that receives the albums
     *  - "code": numeric id used in the site's /girl/{code} URLs
     *  - "name": display name (not read by the code in this class)
     *
     * Commented-out entries are disabled.
     */
    private static $name_dir = [
        // "周韦彤" => [
        //     "dir" => "周韦彤",
        //     "code" => 16274,
        //     "name" => "周韦彤"
        // ],
        // "ycc" => [
        //     "dir" => "ycc",
        //     "code" => 22162,
        //     "name" => "杨晨晨"
        // ],
        // "ry" => [
        //     "dir" => "忍野さら",
        //     "code" => "21250",
        //     "name" => "忍野さら"
        // ],
        "azu" => [
            "dir" => "azu",
            "code" => 26002,
            "name" => "阿朱"
        ],
        "xq" => [
            "dir" => "xq",
            "code" => 22204,
            "name" => "小琪"
        ],
        "ygh" => [
            "dir" => "ygh",
            "code" => 15902,
            "name" => "原干惠"
        ],
        "wyc" => [
            "dir" => "wyc",
            "code" => 19702,
            "name" => "王语纯"
        ],
        "zz" => [
            "dir" => "zz",
            "code" => 22899,
            "name" => "芝芝 booty"
        ],
        "hlr" => [
            "dir" => "hlr",
            "code" => 20015,
            "name" => "黄乐然"
        ],
        "jrq" => [
            "dir" => "jrq",
            "code" => 26560,
            "name" => "姜仁卿"
        ],
        "ny" => [
            "dir" => "ny",
            "code" => 26298,
            "name" => "奈月"
        ],
        "杉本有美" => [
            "dir" => "杉本有美",
            "code" => 15939,
            "name" => "杉本有美"
        ]
    ];

    /** Root URL of the gallery site. */
    private static $siteUrl = "https://www.nvshens.net";

    /** Root URL of the image host; images live at {personCode}/{albumCode}/{file}. */
    private static $imageHostUrl = "https://t1.onvshen.com:85/gallery/";

    /** Local directory below which each person's "dir" is created. */
    private static $imageRoot = "/Volumes/intel660p/image/xg/";

    /** Upper bound on attempts for a single page fetch or image download. */
    private static $maxAttempts = 100;

    /** Number of albums the site lists per album-index page. */
    private static $albumsPerPage = 30;

    /** Browser User-Agent sent with image requests. */
    private static $userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36';

    /**
     * Scrape every configured person: walk their album index pages, then
     * download every image of every album that is not on disk yet.
     *
     * Progress and errors are written to stdout via echo/dump. A page that
     * cannot be fetched after all retries is reported and skipped instead of
     * aborting the whole run.
     */
    public function scrapeNvshenGirls()
    {
        $mainQl = QueryList::getInstance();
        foreach (self::$name_dir as $username => $name) {
            $this->scrapePerson($mainQl, (string) $username, $name);
            usleep(1000 * random_int(100, 1000));
        }
        usleep(1000 * random_int(100, 1000));
    }

    /**
     * Debug helper: fetch one hard-coded album page and echo its image-count
     * and title fragments.
     */
    public function subTest()
    {
        $album = "/g/31347/";
        $pageQL = QueryList::get(self::$siteUrl . $album);
        $page = $pageQL->find(".albumInfo > span")->htmls();
        echo $page;
        $title = $pageQL->find(".albumTitle > #htilte")->htmls();
        echo $title;
    }

    /**
     * Scrape all album index pages of one person.
     *
     * @param QueryList $ql       shared QueryList instance used for all page fetches
     * @param string    $username file-name prefix for this person's images
     * @param array     $person   one entry of self::$name_dir ("dir", "code", ...)
     */
    private function scrapePerson(QueryList $ql, string $username, array $person)
    {
        $baseDir = self::$imageRoot . $person["dir"] . "/";
        // Album directory names can differ between sites, so existing albums are
        // matched by their numeric id prefix. Rebuilt per person so ids found under
        // one person's directory never resolve to paths under another's.
        $albumCodeMap = $this->buildAlbumCodeMap($baseDir);

        usleep(random_int(1000, 10000) * 1000);
        $peopleUrl = self::$siteUrl . "/girl/" . $person['code'];

        $peoplePage = $this->fetchPage($ql, $peopleUrl);
        if ($peoplePage === null) {
            echo "giving up on {$peopleUrl}: no response after " . self::$maxAttempts . " attempts \n";
            return;
        }

        // The "more" link carries the total album count; it is absent when all
        // albums fit on the person's own page.
        $albumCountHtml = $peoplePage->find(".archive_more > a")->htmls()->all();
        $onlyOnePage = count($albumCountHtml) == 0;
        if ($onlyOnePage) {
            $totalAlbumPage = 1;
        } else {
            $totalAlbumNum = $this->firstNumber($albumCountHtml[0] ?? '');
            dump($totalAlbumNum);
            $totalAlbumPage = (int) ceil($totalAlbumNum / self::$albumsPerPage);
        }

        $baseAlbumUrl = self::$siteUrl . "/girl/{$person['code']}/album/";
        for ($pageNo = 1; $pageNo <= $totalAlbumPage; $pageNo++) {
            dump("current album page no: " . $pageNo);
            $listUrl = $onlyOnePage ? $peopleUrl : $baseAlbumUrl . $pageNo . ".html";
            $albumQl = $this->fetchPage($ql, $listUrl);
            if ($albumQl === null) {
                echo "giving up on {$listUrl}: no response after " . self::$maxAttempts . " attempts \n";
                continue;
            }

            $pageAlbum = $albumQl->find(".igalleryli > .igalleryli_div > .igalleryli_link")->attrs("href")->all();
            foreach ($pageAlbum as $album) {
                if (!is_string($album) || $album === '') {
                    // Link without an href: nothing to follow.
                    continue;
                }
                // Long pause (10-100 s) between albums.
                usleep(10000 * random_int(1000, 10000));
                dump("相册:", [$album]);
                $this->scrapeAlbum($ql, $username, $person, $album, $baseDir, $albumCodeMap);
            }
            usleep(1000 * random_int(100, 1000));
        }
    }

    /**
     * Download every image of one album that is not on disk yet.
     *
     * @param QueryList $ql           shared QueryList instance
     * @param string    $username     file-name prefix for this person's images
     * @param array     $person       one entry of self::$name_dir
     * @param string    $album        site-relative album href, e.g. "/g/31347/"
     * @param string    $baseDir      this person's directory, with trailing slash
     * @param array     $albumCodeMap album id => existing directory name below $baseDir
     */
    private function scrapeAlbum(QueryList $ql, string $username, array $person, string $album, string $baseDir, array $albumCodeMap)
    {
        $albumUrl = self::$siteUrl . $album;
        // One fetch serves both selectors below.
        $albumQl = $this->fetchPage($ql, $albumUrl);
        if ($albumQl === null) {
            echo "giving up on {$albumUrl}: no response after " . self::$maxAttempts . " attempts \n";
            return;
        }
        $page = $albumQl->find(".albumInfo > span")->htmls()->all();
        $title = $albumQl->find(".albumTitle > #htilte")->htmls()->all();
        dump($title);

        // A "/" in the title would be taken as a path separator by mkdir().
        $titleStr = str_replace('/', '_', (string) ($title[0] ?? ''));
        $totalImageNum = $this->firstNumber($page[0] ?? '');
        if ($totalImageNum <= 0) {
            dump($page);
            return;
        }

        $albumCode = $this->albumCodeFromHref($album);
        if (array_key_exists($albumCode, $albumCodeMap)) {
            // Reuse the directory already on disk even if its title part differs.
            $albumPath = $baseDir . $albumCodeMap[$albumCode];
        } else {
            $albumPath = $baseDir . $albumCode . "-" . $titleStr;
            if (!file_exists($albumPath)) {
                dump($albumPath);
                // Recursive so a missing person directory is created as well.
                if (!mkdir($albumPath, 0777, true) && !is_dir($albumPath)) {
                    echo "cannot create directory {$albumPath}, album skipped \n";
                    return;
                }
            }
        }

        $baseImageUrl = self::$imageHostUrl . "{$person['code']}/{$albumCode}/";
        for ($j = 0; $j < $totalImageNum; $j++) {
            $padded = str_pad((string) $j, 3, "0", STR_PAD_LEFT) . ".jpg";
            // The first image is served as "0.jpg"; all others are zero-padded to
            // three digits. Local names are always zero-padded.
            $imageUrl = $baseImageUrl . ($j == 0 ? "0.jpg" : $padded);
            $imagePath = $albumPath . "/" . $username . "-" . $albumCode . "-" . $padded;

            if (file_exists($imagePath)) {
                dump($imagePath . " exists. skipped!");
                continue;
            }

            $body = $this->downloadImage($imageUrl, $albumUrl);
            if ($body === false || $body === '') {
                // Write nothing: an empty file would be treated as "already
                // downloaded" on the next run.
                echo "download failed, not saved: {$imageUrl} \n";
            } elseif (file_put_contents($imagePath, $body) === false) {
                echo "cannot write {$imagePath} \n";
            }

            $sleepTime = 1000 * random_int(100, 1000);
            echo "after write image sleep {$sleepTime} microseconds \n";
            usleep($sleepTime);
            echo $imageUrl;
        }
        dump($page);
    }

    /**
     * Map album id => directory name for every sub-directory of $baseDir whose
     * name starts with a numeric id followed by "-".
     *
     * @param string $baseDir directory to scan, with trailing slash
     * @return array empty when $baseDir does not exist or cannot be read
     */
    private function buildAlbumCodeMap(string $baseDir): array
    {
        $map = [];
        if (!is_dir($baseDir)) {
            return $map;
        }
        $entries = scandir($baseDir);
        if ($entries === false) {
            return $map;
        }
        foreach ($entries as $entry) {
            if ($entry == "." || $entry == "..") {
                continue;
            }
            $code = explode("-", $entry)[0];
            if (is_dir($baseDir . $entry) && is_numeric($code)) {
                $map[$code] = $entry;
            }
        }
        return $map;
    }

    /**
     * GET a page through QueryList, retrying on connection errors.
     *
     * Only ConnectException is retried; any other exception propagates.
     *
     * @param QueryList $ql
     * @param string    $url
     * @return QueryList|null the value returned by get(), or null when every attempt failed
     */
    private function fetchPage(QueryList $ql, string $url)
    {
        for ($attempt = 0; $attempt < self::$maxAttempts; $attempt++) {
            try {
                return $ql->get($url, [], ['maxTry' => 5]);
            } catch (ConnectException $e) {
                echo 'connection error: ' . $e->getMessage() . "\n";
                echo "retry times: " . $attempt . " times \n";
                $this->pauseBeforeRetry();
            }
        }
        return null;
    }

    /**
     * Download one image with curl, retrying while the transfer itself fails.
     *
     * @param string $imageUrl absolute image URL
     * @param string $referer  album page URL, sent as Referer
     * @return string|false image bytes, or false on transfer failure / HTTP status >= 400
     */
    private function downloadImage(string $imageUrl, string $referer)
    {
        $handle = curl_init();
        if ($handle === false) {
            echo "Curl error: cannot initialise handle \n";
            return false;
        }
        try {
            curl_setopt($handle, CURLOPT_URL, $imageUrl);
            // NOTE(review): CURLOPT_CONNECTTIMEOUT is in seconds, so 2000 is over
            // half an hour; CURLOPT_CONNECTTIMEOUT_MS may have been intended - confirm.
            curl_setopt($handle, CURLOPT_CONNECTTIMEOUT, 2000);
            curl_setopt($handle, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($handle, CURLOPT_USERAGENT, self::$userAgent);
            curl_setopt($handle, CURLOPT_REFERER, $referer);

            $body = curl_exec($handle);
            // Own counter: must not share a variable with any enclosing loop.
            for ($attempt = 1; $body === false && $attempt < self::$maxAttempts; $attempt++) {
                echo 'Curl error: ' . curl_error($handle) . "\n";
                echo "retry times: " . $attempt . " times \n";
                $this->pauseBeforeRetry();
                $body = curl_exec($handle);
            }
            if ($body === false) {
                echo curl_error($handle);
                return false;
            }

            $status = (int) curl_getinfo($handle, CURLINFO_HTTP_CODE);
            if ($status >= 400) {
                // The body is an error page, not an image.
                echo "HTTP {$status} for {$imageUrl} \n";
                return false;
            }
            return $body;
        } finally {
            curl_close($handle);
        }
    }

    /**
     * Sleep 1 s plus a random 1-10 s before the next retry.
     */
    private function pauseBeforeRetry()
    {
        sleep(1);
        $sleepTime = 1000 * random_int(1000, 10000);
        echo "retry sleep {$sleepTime} microseconds \n";
        usleep($sleepTime);
    }

    /**
     * Return the first run of digits in $text as an integer, or 0 when there is none.
     *
     * @param mixed $text value to search; cast to string first
     */
    private function firstNumber($text): int
    {
        if (preg_match("#\d+#", (string) $text, $match) === 1) {
            return (int) $match[0];
        }
        return 0;
    }

    /**
     * Extract the numeric album id from an album href such as "/g/31347/".
     *
     * Falls back to the fixed-offset slice (characters 3-7) when the href does
     * not contain "/g/{digits}".
     */
    private function albumCodeFromHref(string $href): string
    {
        if (preg_match('#/g/(\d+)#', $href, $match) === 1) {
            return $match[1];
        }
        return (string) substr($href, 3, 5);
    }
}