You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

454 lines
19 KiB

<?php
namespace App\Services;
use QL\QueryList;
use Illuminate\Http\File;
use Illuminate\Support\Facades\Storage;
use App\Utils\FileUtils;
/**
 * Downloads photo albums from xsnvshen.com to a local directory tree.
 *
 * Layout on disk: DOWNLOAD_ROOT/<dir>/<albumCode>-<albumTitle>/<user>-<albumCode>-<image>.jpg
 *
 * Both public entry points end with `exit`, i.e. they terminate the PHP process
 * once the work is done. That behavior is kept because existing callers may
 * rely on it.
 */
class NewXiuGirlsService
{
    /** Host used for album pages and as the Referer of image requests. */
    const SITE_BASE_URL = "https://www.xsnvshen.com";

    /** Prefix of a profile page URL; the profile code is appended to it. */
    const PROFILE_URL_PREFIX = "https://xsnvshen.com/girl/";

    /** Root directory that receives one sub-directory per configured profile. */
    const DOWNLOAD_ROOT = "/Volumes/intel660p/image/xg/";

    /** Selector of the album links on a profile page. */
    const ALBUM_LINK_SELECTOR = ".entryAblum > .star-mod-bd > ul > li > a";

    /** Selector of the album title on an album page. */
    const ALBUM_TITLE_SELECTOR = "h1 > a";

    /** Selector of the image elements on an album page. */
    const ALBUM_IMAGE_SELECTOR = ".swi-hd > img";

    /**
     * Title that marks a page as "not the album": whenever it shows up the page
     * is fetched again. Presumably an anti-scraping placeholder page.
     */
    const PLACEHOLDER_TITLE = "古诗文";

    /** User-Agent header sent with image requests. */
    const USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36';

    /** Maximum number of attempts to load one album page. */
    const MAX_PAGE_ATTEMPTS = 10;

    /** Maximum number of refetches while the placeholder title keeps coming back. */
    const MAX_PLACEHOLDER_REFETCHES = 50;

    /** Maximum number of retries (after the first try) for one image download. */
    const MAX_DOWNLOAD_RETRIES = 99;

    /**
     * Configured profiles, keyed by a short handle.
     *
     * - key:    handle; used as file name prefix of downloaded images
     * - "dir":  sub-directory of DOWNLOAD_ROOT that receives the albums
     * - "code": numeric profile id appended to PROFILE_URL_PREFIX
     * - "name": display name (not read by the code in this class)
     */
    private static $name_dir = [
        "周韦彤" => [
            "dir" => "周韦彤",
            "code" => 16274,
            "name" => "周韦彤"
        ],
        "ry" => [
            "dir" => "忍野さら",
            "code" => "21250",
            "name" => "忍野さら"
        ],
        "ycc" => [
            "dir" => "ycc",
            "code" => 22162,
            "name" => "杨晨晨"
        ],
        "azu" => [
            "dir" => "azu",
            "code" => 26002,
            "name" => "阿朱"
        ],
        "xq" => [
            "dir" => "xq",
            "code" => 22204,
            "name" => "小琪"
        ],
        "ygh" => [
            "dir" => "ygh",
            "code" => 15902,
            "name" => "原干惠"
        ],
        "wyc" => [
            "dir" => "wyc",
            "code" => 19702,
            "name" => "王语纯"
        ],
        "zz" => [
            "dir" => "zz",
            "code" => 22899,
            "name" => "芝芝 booty"
        ],
        "hlr" => [
            "dir" => "hlr",
            "code" => 20015,
            "name" => "黄乐然"
        ],
        "jrq" => [
            "dir" => "jrq",
            "code" => 26560,
            "name" => "姜仁卿"
        ],
        "ny" => [
            "dir" => "ny",
            "code" => 26298,
            "name" => "奈月"
        ],
        "杉本有美" => [
            "dir" => "杉本有美",
            "code" => 15939,
            "name" => "杉本有美"
        ]
    ];

    /**
     * Downloads every album listed on the profile page of every configured profile.
     *
     * Images that already exist on disk are skipped, so the method can be re-run
     * to pick up new albums. An exception while loading a profile page is not
     * caught and aborts the run. Terminates the process with `exit` when done.
     */
    public function scrapeXiuGirls()
    {
        foreach (self::$name_dir as $username => $name) {
            // Random 1-10 s pause between profiles to keep the request rate low.
            usleep(random_int(1000, 10000) * 1000);

            $items = QueryList::get(self::PROFILE_URL_PREFIX . $name['code'])
                ->find(self::ALBUM_LINK_SELECTOR)
                ->attrs("href")
                ->all();

            $baseDir = $this->ensureBaseDir($name['dir']);

            // Scan the album directories that already exist below $baseDir; the
            // result maps album code => directory (used below as the album path).
            $baseMap = FileUtils::scanBaseDir($baseDir);

            foreach ($items as $item) {
                $this->downloadAlbum($username, $item, $baseDir, $baseMap);
            }
        }

        // Kept from the original implementation: ends the whole PHP process.
        exit;
    }

    /**
     * Downloads one album of one configured profile.
     *
     * The album page is requested directly as "/album/<code>"; the profile page
     * is not consulted. Terminates the process with `exit` when done.
     *
     * @param string     $iUser          key of the profile in the internal profile table
     * @param int|string $albumOuterCode album id as used in the "/album/<id>" URL
     *
     * @throws \InvalidArgumentException when $iUser is not a configured profile
     * @throws \RuntimeException         when the profile directory cannot be created
     */
    public function singleAlbum($iUser = "ycc", $albumOuterCode = 21429)
    {
        if (!isset(self::$name_dir[$iUser])) {
            throw new \InvalidArgumentException("unknown profile key: " . $iUser);
        }

        $baseDir = $this->ensureBaseDir(self::$name_dir[$iUser]['dir']);
        $this->downloadAlbum($iUser, "/album/" . $albumOuterCode, $baseDir);

        // Kept from the original implementation: ends the whole PHP process.
        exit;
    }

    /**
     * Returns the download directory of a profile, creating it when missing.
     *
     * The directory is created non-recursively on purpose: when the download
     * root itself is missing (e.g. the volume is not mounted) nothing is created.
     *
     * @param string $dirName sub-directory name below DOWNLOAD_ROOT
     *
     * @return string directory path with a trailing slash
     *
     * @throws \RuntimeException when the directory cannot be created
     */
    private function ensureBaseDir($dirName)
    {
        $baseDir = self::DOWNLOAD_ROOT . $dirName . "/";
        if (!is_dir($baseDir) && !mkdir($baseDir) && !is_dir($baseDir)) {
            throw new \RuntimeException("cannot create directory: " . $baseDir);
        }

        return $baseDir;
    }

    /**
     * Downloads all images of one album that are not on disk yet.
     *
     * Failures (page not loadable, directory not creatable, image not
     * downloadable) are logged and skipped so the remaining work continues.
     *
     * @param string $username handle used as file name prefix
     * @param string $item     album link relative to the site, e.g. "/album/12345"
     * @param string $baseDir  profile directory with a trailing slash
     * @param array  $baseMap  existing album directories keyed by album code
     */
    private function downloadAlbum($username, $item, $baseDir, $baseMap = [])
    {
        // "/album/12345" => ["", "album", "12345"]
        $segments = explode("/", (string) $item);
        $albumCode = $segments[2] ?? "";
        if ($albumCode === "") {
            \Log::error("unexpected album link, skipped: " . $item);
            return;
        }

        // Random 1-10 s pause between albums.
        usleep(random_int(1000, 10000) * 1000);
        echo "相册子链接: " . $item . "\n";
        $albumUrl = self::SITE_BASE_URL . $item;

        // Load the page; reload it (bounded) while the placeholder title is served.
        $html = null;
        $title = "";
        for ($refetch = 0; $refetch <= self::MAX_PLACEHOLDER_REFETCHES; $refetch++) {
            if ($refetch > 0) {
                echo "here error happenned \n";
                usleep(random_int(1, 1000) * 50000);
            }

            $html = $this->fetchAlbumPage($albumUrl);
            if ($html === null) {
                \Log::error("album page could not be loaded, skipped: " . $albumUrl);
                return;
            }

            $titles = $html->find(self::ALBUM_TITLE_SELECTOR)->texts();
            $title = (string) ($titles[0] ?? "");
            if (trim($title) !== self::PLACEHOLDER_TITLE) {
                break;
            }
            $html = null;
        }
        if ($html === null) {
            \Log::error("album page kept returning the placeholder title, skipped: " . $albumUrl);
            return;
        }
        echo "相册名: " . $title . "\n";

        // Prefer a directory that already exists for this album code, otherwise
        // use "<code>-<title>". A "/" in the title would split the path.
        $albumPath = "";
        if (is_array($baseMap) && array_key_exists($albumCode, $baseMap)) {
            $albumPath = (string) $baseMap[$albumCode];
        }
        if ($albumPath === "") {
            $albumPath = $baseDir . $albumCode . "-" . str_replace("/", "-", $title);
        }
        if (!is_dir($albumPath) && !mkdir($albumPath) && !is_dir($albumPath)) {
            \Log::error("cannot create album directory, skipped: " . $albumPath);
            return;
        }

        // The image list is read from the page that is already loaded.
        foreach ($html->find(self::ALBUM_IMAGE_SELECTOR)->attrs("src") as $image) {
            if (!is_string($image) || $image === "") {
                continue;
            }

            // Drop the thumbnail path segment from the URL.
            $image = str_replace("thumb_600x900/", "", $image);

            // The "http:" / "https:" prefixes below imply scheme-less "//host/path" URLs.
            $fileName = pathinfo("http:" . $image, PATHINFO_FILENAME);
            $legacyPath = $albumPath . "/" . $fileName . ".jpg";
            $targetPath = $albumPath . "/" . $username . "-" . $albumCode . "-" . $fileName . ".jpg";
            if (file_exists($legacyPath) || file_exists($targetPath)) {
                continue;
            }

            $this->downloadImage("https:" . $image, $albumUrl, $targetPath);

            // usleep() takes microseconds; the message wording is kept as it was.
            $sleepTime = 1000 * random_int(100, 1000);
            echo "after write image sleep {$sleepTime} nano second \n";
            usleep($sleepTime);
        }

        usleep(1000 * random_int(100, 1000));
    }

    /**
     * Loads an album page, retrying with a random 1-10 s pause on exceptions.
     *
     * @param string $albumUrl absolute URL of the album page
     *
     * @return QueryList|null the loaded document, or null when every attempt failed
     */
    private function fetchAlbumPage($albumUrl)
    {
        for ($attempt = 1; $attempt <= self::MAX_PAGE_ATTEMPTS; $attempt++) {
            try {
                return (new QueryList)->get($albumUrl);
            } catch (\Exception $e) {
                \Log::error("查询相册子链接失败,将重试, 异常信息: " . $e->getMessage());
                if ($attempt === self::MAX_PAGE_ATTEMPTS) {
                    break;
                }
                $sleepTime = 1000 * random_int(1000, 10000);
                echo "查询相册子链接失败 sleep {$sleepTime} nano second \n";
                usleep($sleepTime);
            }
        }

        return null;
    }

    /**
     * Downloads one image and writes it to a new file.
     *
     * Nothing is written when the transfer fails, the server answers with an
     * HTTP error status, or the body is empty. Otherwise a bad file would be
     * created and the exists-check would skip the image on every later run.
     *
     * @param string $url         absolute image URL
     * @param string $referer     value of the Referer header (the album page URL)
     * @param string $destination path of the file to create; must not exist yet
     *
     * @return bool true when the file was written completely
     */
    private function downloadImage($url, $referer, $destination)
    {
        $handle = curl_init();
        if ($handle === false) {
            \Log::error("curl_init failed, image skipped: " . $url);
            return false;
        }

        curl_setopt($handle, CURLOPT_URL, $url);
        // NOTE(review): CURLOPT_CONNECTTIMEOUT is in seconds, so this is ~33 minutes;
        // value kept as found - confirm whether milliseconds were intended.
        curl_setopt($handle, CURLOPT_CONNECTTIMEOUT, 2000);
        curl_setopt($handle, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($handle, CURLOPT_USERAGENT, self::USER_AGENT);
        curl_setopt($handle, CURLOPT_REFERER, $referer);

        $body = curl_exec($handle);
        for ($retry = 1; $body === false && $retry <= self::MAX_DOWNLOAD_RETRIES; $retry++) {
            echo 'Curl error: ' . curl_error($handle) . "\n";
            echo "retry times: " . $retry . " times \n";
            sleep(1);
            $sleepTime = 1000 * random_int(1000, 10000);
            echo "retry sleep {$sleepTime} nano second \n";
            usleep($sleepTime);
            $body = curl_exec($handle);
        }

        $error = curl_error($handle);
        $status = (int) curl_getinfo($handle, CURLINFO_HTTP_CODE);
        curl_close($handle);

        if ($body === false) {
            \Log::error("image download failed, skipped: " . $url . ", curl error: " . $error);
            return false;
        }
        if ($status >= 400 || $body === "") {
            \Log::error("image download returned HTTP " . $status . " or an empty body, skipped: " . $url);
            return false;
        }

        // Mode 'x' creates the file and fails when it already exists.
        $fp = fopen($destination, 'x');
        if ($fp === false) {
            \Log::error("cannot create image file: " . $destination);
            return false;
        }
        try {
            $written = fwrite($fp, $body);
        } finally {
            fclose($fp);
        }

        if ($written !== strlen($body)) {
            // Remove the partial file so the next run downloads the image again.
            unlink($destination);
            \Log::error("incomplete write, image file removed: " . $destination);
            return false;
        }

        return true;
    }
}