<?php
|
|
|
|
|
|
namespace App\Services;
|
|
|
|
|
|
use QL\QueryList;
|
|
use Illuminate\Http\File;
|
|
use Illuminate\Support\Facades\Storage;
|
|
|
|
/**
 * Downloads photo albums from xsnvshen.com to a local directory tree.
 *
 * Layout on disk: {storage root}/{person dir}/{album code}-{album title}/{user key}-{album code}-{image name}.jpg
 *
 * Both public entry points terminate the PHP process with `exit` once they
 * are done; they are written to be driven from a one-off CLI command.
 */
class NewXiuGirlsService
{
    /**
     * People to scrape, keyed by a short user key.
     *
     * Each entry holds: "dir" (sub-directory under the storage root),
     * "code" (numeric id appended to the profile URL) and "name" (display name).
     */
    private static $name_dir = [
        "周韦彤" => [
            "dir" => "周韦彤",
            "code" => 16274,
            "name" => "周韦彤"
        ],
        "ry" => [
            "dir" => "忍野さら",
            "code" => "21250",
            "name" => "忍野さら"
        ],
        "ycc" => [
            "dir" => "ycc",
            "code" => 22162,
            "name" => "杨晨晨"
        ],
        "azu" => [
            "dir" => "azu",
            "code" => 26002,
            "name" => "阿朱"
        ],
        "xq" => [
            "dir" => "xq",
            "code" => 22204,
            "name" => "小琪"
        ],
        "ygh" => [
            "dir" => "ygh",
            "code" => 15902,
            "name" => "原干惠"
        ],
        "wyc" => [
            "dir" => "wyc",
            "code" => 19702,
            "name" => "王语纯"
        ],
        "zz" => [
            "dir" => "zz",
            "code" => 22899,
            "name" => "芝芝 booty"
        ],
        "hlr" => [
            "dir" => "hlr",
            "code" => 20015,
            "name" => "黄乐然"
        ],
        "jrq" => [
            "dir" => "jrq",
            "code" => 26560,
            "name" => "姜仁卿"
        ],
        "ny" => [
            "dir" => "ny",
            "code" => 26298,
            "name" => "奈月"
        ],
        "杉本有美" => [
            "dir" => "杉本有美",
            "code" => 15939,
            "name" => "杉本有美"
        ]
    ];

    /** Host that album links (e.g. "/album/21429") are resolved against. */
    private static $siteUrl = "https://www.xsnvshen.com";

    /** Profile page prefix; the person's "code" is appended. */
    private static $profileUrl = "https://xsnvshen.com/girl/";

    /** Root directory that receives one sub-directory per person. */
    private static $storageRoot = "/Volumes/intel660p/image/xg/";

    /** CSS selector for album links on a profile page. */
    private static $albumSelector = ".entryAblum > .star-mod-bd > ul > li > a";

    /** CSS selector for the album title on an album page. */
    private static $titleSelector = "h1 > a";

    /** CSS selector for the images on an album page. */
    private static $imageSelector = ".swi-hd > img";

    /**
     * Title that marks a response as unusable and triggers a refetch.
     * NOTE(review): presumably a placeholder / anti-scraping page served by the site - confirm.
     */
    private static $placeholderTitle = "古诗文";

    /** User agent sent with every image request. */
    private static $userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36';

    /** Upper bound on attempts to load one album page. */
    private static $maxPageAttempts = 50;

    /** Upper bound on attempts to download one image. */
    private static $maxImageAttempts = 100;

    /**
     * Downloads every album of every person listed in $name_dir.
     *
     * Random delays are inserted between requests. An exception while loading
     * a profile page is not caught and aborts the run. Never returns: the
     * process exits when the loop finishes.
     */
    public function scrapeXiuGirls()
    {
        foreach (self::$name_dir as $username => $person) {
            $this->pause(1000, 10000);

            $albumLinks = QueryList::get(self::$profileUrl . $person['code'])
                ->find(self::$albumSelector)
                ->attrs("href")
                ->all();

            $baseDir = $this->ensureDirectory(self::$storageRoot . $person['dir'] . "/");

            foreach ($albumLinks as $albumLink) {
                $this->pause(1000, 10000);
                $this->downloadAlbum((string) $username, $baseDir, (string) $albumLink);
                $this->pause(100, 1000);
            }
        }

        // NOTE(review): exit is kept because existing callers may rely on the process ending here.
        exit;
    }

    /**
     * Downloads one album of one person.
     *
     * @param string     $iUser          Key into $name_dir.
     * @param int|string $albumOuterCode Album code as it appears in "/album/{code}".
     *
     * @throws \InvalidArgumentException When $iUser is not a known key.
     *
     * Never returns normally: the process exits after the album is handled.
     */
    public function singleAlbum($iUser = "ycc", $albumOuterCode = 21429)
    {
        if (!isset(self::$name_dir[$iUser])) {
            throw new \InvalidArgumentException("Unknown user key: {$iUser}");
        }

        $person = self::$name_dir[$iUser];
        $baseDir = $this->ensureDirectory(self::$storageRoot . $person['dir'] . "/");

        // Build the link from the argument; it used to be hard-coded to album
        // 21429, which made every other album code a silent no-op.
        $this->downloadAlbum((string) $iUser, $baseDir, "/album/" . $albumOuterCode);

        // NOTE(review): exit is kept because existing callers may rely on the process ending here.
        exit;
    }

    /**
     * Downloads all images of a single album into its own directory.
     *
     * Failures (unparsable link, page never loads, directory cannot be
     * created) are logged and the album is skipped.
     *
     * @param string $username  User key, used as a file name prefix.
     * @param string $baseDir   Person directory, with trailing slash.
     * @param string $albumLink Site-relative link such as "/album/21429".
     */
    private function downloadAlbum(string $username, string $baseDir, string $albumLink)
    {
        echo "相册子链接: " . $albumLink . "\n";

        // "/album/21429" splits into ["", "album", "21429"].
        $segments = explode("/", $albumLink);
        $albumCode = $segments[2] ?? "";
        if ($albumCode === "") {
            \Log::error("Skipping album link without an album code: " . $albumLink);
            echo "skip album, no album code in link: " . $albumLink . "\n";
            return;
        }

        $albumUrl = self::$siteUrl . $albumLink;
        $album = $this->fetchAlbum($albumUrl);
        if ($album === null) {
            \Log::error("Giving up on album after " . self::$maxPageAttempts . " attempts: " . $albumUrl);
            echo "skip album, page could not be loaded: " . $albumUrl . "\n";
            return;
        }

        echo "相册名: " . $album['title'] . "\n";

        try {
            $albumPath = $this->ensureDirectory($baseDir . $albumCode . "-" . $album['title']);
        } catch (\RuntimeException $e) {
            \Log::error($e->getMessage());
            echo "skip album: " . $e->getMessage() . "\n";
            return;
        }

        foreach ($album['images'] as $imageSrc) {
            if (!is_string($imageSrc) || $imageSrc === "") {
                continue;
            }
            $this->downloadImage($username, $albumCode, $albumPath, $albumUrl, $imageSrc);
        }
    }

    /**
     * Loads an album page and extracts its title and image sources.
     *
     * The page is fetched once per attempt and used for both the title and
     * the image list. An attempt fails when the request throws, when no title
     * is found, or when the title equals the placeholder title.
     *
     * @param string $albumUrl Absolute album URL.
     *
     * @return array|null ['title' => string, 'images' => array] or null when
     *                    every attempt failed.
     */
    private function fetchAlbum(string $albumUrl)
    {
        for ($attempt = 1; $attempt <= self::$maxPageAttempts; $attempt++) {
            try {
                $page = QueryList::get($albumUrl);
                $titles = $page->find(self::$titleSelector)->texts()->all();
                $images = $page->find(self::$imageSelector)->attrs("src")->all();
            } catch (\Exception $e) {
                \Log::error("查询相册子链接失败,将重试, 异常信息: " . $e->getMessage());
                $this->pause(1000, 10000, "查询相册子链接失败 sleep");
                continue;
            }

            $titles = array_values($titles);
            $title = isset($titles[0]) ? (string) $titles[0] : "";

            if ($title !== "" && trim($title) !== self::$placeholderTitle) {
                return ['title' => $title, 'images' => $images];
            }

            echo "here error happenned \n";
            $this->pause(50, 50000);
        }

        return null;
    }

    /**
     * Downloads one image unless it is already present in the album directory.
     *
     * @param string $username  User key, used as a file name prefix.
     * @param string $albumCode Album code, used as a file name prefix.
     * @param string $albumPath Album directory without trailing slash.
     * @param string $albumUrl  Album page URL, sent as the Referer header.
     * @param string $imageSrc  Protocol-relative image source ("//host/path").
     */
    private function downloadImage(
        string $username,
        string $albumCode,
        string $albumPath,
        string $albumUrl,
        string $imageSrc
    ) {
        // Dropping the thumbnail path segment yields the full-size image URL.
        $imageSrc = str_replace("thumb_600x900/", "", $imageSrc);

        $fileName = pathinfo("http:" . $imageSrc, PATHINFO_FILENAME) . ".jpg";
        $unprefixedPath = $albumPath . "/" . $fileName;
        $targetPath = $albumPath . "/" . $username . "-" . $albumCode . "-" . $fileName;

        // Either naming scheme counts as "already downloaded".
        if (file_exists($unprefixedPath) || file_exists($targetPath)) {
            return;
        }

        $body = $this->fetchImage("https:" . $imageSrc, $albumUrl);
        if ($body === null) {
            // Write nothing: an empty file would be mistaken for a finished
            // download and skipped on every later run.
            \Log::error("Image download failed, skipped: https:" . $imageSrc);
            echo "image download failed, skipped: https:" . $imageSrc . "\n";
            return;
        }

        if (file_put_contents($targetPath, $body) === false) {
            \Log::error("Unable to write image file: " . $targetPath);
            echo "unable to write image file: " . $targetPath . "\n";
            return;
        }

        $this->pause(100, 1000, "after write image sleep");
    }

    /**
     * Fetches an image with cURL, retrying on transport errors and empty bodies.
     *
     * @param string $imageUrl Absolute image URL.
     * @param string $referer  Value for the Referer header.
     *
     * @return string|null Raw response body, or null when every attempt failed.
     */
    private function fetchImage(string $imageUrl, string $referer)
    {
        $handle = curl_init();
        curl_setopt_array($handle, [
            CURLOPT_URL => $imageUrl,
            // NOTE(review): this option is in seconds; 2000 may have been meant as milliseconds - confirm.
            CURLOPT_CONNECTTIMEOUT => 2000,
            CURLOPT_RETURNTRANSFER => 1,
            CURLOPT_USERAGENT => self::$userAgent,
            CURLOPT_REFERER => $referer,
        ]);

        try {
            for ($attempt = 1; $attempt <= self::$maxImageAttempts; $attempt++) {
                $body = curl_exec($handle);
                if (is_string($body) && $body !== "") {
                    return $body;
                }

                echo 'Curl error: ' . curl_error($handle) . "\n";
                echo "retry times: " . $attempt . " times \n";

                if ($attempt < self::$maxImageAttempts) {
                    $this->pause(2000, 11000, "retry sleep");
                }
            }

            return null;
        } finally {
            curl_close($handle);
        }
    }

    /**
     * Creates a directory if it does not exist yet (parent must exist).
     *
     * @param string $path Directory path.
     *
     * @return string The same path, for chaining.
     *
     * @throws \RuntimeException When the directory cannot be created.
     */
    private function ensureDirectory(string $path): string
    {
        // The second is_dir() covers another process creating it in between.
        if (!is_dir($path) && !mkdir($path) && !is_dir($path)) {
            throw new \RuntimeException("Unable to create directory: " . $path);
        }

        return $path;
    }

    /**
     * Sleeps for a random duration between $minMs and $maxMs milliseconds.
     *
     * @param int         $minMs Lower bound in milliseconds (inclusive).
     * @param int         $maxMs Upper bound in milliseconds (inclusive).
     * @param string|null $label When given, the delay is announced on stdout.
     */
    private function pause(int $minMs, int $maxMs, $label = null)
    {
        // usleep() takes microseconds.
        $microseconds = 1000 * random_int($minMs, $maxMs);

        if ($label !== null) {
            echo "{$label} {$microseconds} microseconds \n";
        }

        usleep($microseconds);
    }
}
|