You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

493 lines
18 KiB

<?php
namespace App\Services;
use App\XiuGirl;
use Exception;
use Log;
use QL\QueryList;
use Illuminate\Http\File;
use Illuminate\Support\Facades\Storage;
use App\Utils\FileUtils;
use App\Utils\CommonUtils;
class NewXiuGirlsService
{
/**
 * @var QueryList|null Instance obtained from QueryList::getInstance() in the constructor.
 */
private $queryInstance;

/**
 * Keep the instance returned by QueryList::getInstance() so the scraping
 * helpers of this class can reach it through getQueryInstance().
 */
public function __construct()
{
    $sharedQuery = QueryList::getInstance();
    $this->queryInstance = $sharedQuery;
}
/**
 * Accessor for the QueryList instance stored by the constructor.
 *
 * @return QueryList|null The stored instance.
 */
public function getQueryInstance(): ?QueryList
{
    $instance = $this->queryInstance;

    return $instance;
}
/**
 * Models to scrape, keyed by a short user key.
 *
 * How the fields are consumed by the methods of this class:
 *  - the array key is used as the file-name prefix of downloaded images
 *    ("<key>-<albumCode>-<file>.jpg");
 *  - "dir"  is appended to the base image directory to form the model's folder;
 *  - "code" is appended to "https://xsnvshen.com/girl/" to form the profile URL
 *    and is stored as user_id on XiuGirl records;
 *  - "name" is not read by any code in this class (presumably a display label).
 *
 * @var array
 */
private static $name_dir = [
"周韦彤" => [
"dir" => "周韦彤",
"code" => 16274,
"name" => "周韦彤"
],
"ry" => [
"dir" => "忍野さら",
// NOTE(review): this code is a quoted string, unlike the integer codes of the other entries.
"code" => "21250",
"name" => "忍野さら"
],
"ycc" => [
"dir" => "ycc",
"code" => 22162,
"name" => "杨晨晨"
],
"azu" => [
"dir" => "azu",
"code" => 26002,
"name" => "阿朱"
],
"xq" => [
"dir" => "xq",
"code" => 22204,
"name" => "小琪"
],
"ygh" => [
"dir" => "ygh",
"code" => 15902,
"name" => "原干惠"
],
"wyc" => [
"dir" => "wyc",
"code" => 19702,
"name" => "王语纯"
],
"zz" => [
"dir" => "zz",
"code" => 22899,
"name" => "芝芝 booty"
],
"hlr" => [
"dir" => "hlr",
"code" => 20015,
"name" => "黄乐然"
],
"jrq" => [
"dir" => "jrq",
"code" => 26560,
"name" => "姜仁卿"
],
"ny" => [
"dir" => "ny",
"code" => 26298,
"name" => "奈月"
],
"杉本有美" => [
"dir" => "杉本有美",
"code" => 15939,
"name" => "杉本有美"
]
];
/**
 * Crawl every album of every model listed in self::$name_dir and download
 * the images that are not yet present on disk.
 *
 * For each model: list her album links, make sure her base directory exists,
 * then for each album resolve the title and target directory, list the image
 * URLs and download those that checkIfNeedDownload() reports as missing.
 *
 * @return void
 * @throws Exception Propagated from the helpers and from random_int().
 */
public function scrapeXiuGirls()
{
    $NUM_OF_ATTEMPTS = 50;
    $baseUrl = "https://www.xsnvshen.com";

    foreach (self::$name_dir as $username => $name) {
        $items = $this->queryAllAlbum($name);

        $baseDir = "/Volumes/intel660p/image/xg/" . $name["dir"] . "/";
        if (!file_exists($baseDir)) {
            // Recursive so a missing parent folder does not abort the whole run.
            mkdir($baseDir, 0777, true);
        }

        // Per the original comment: scans the album directories that already exist
        // under $baseDir and returns them keyed by album code (value = directory).
        $baseMap = FileUtils::scanBaseDir($baseDir);

        foreach ($items as $item) {
            CommonUtils::randomSleep(100);
            echo "相册子链接: " . $item . "\n";

            $title = $this->getAlbumTitle($baseUrl, $item);
            // $item is indexed at [2] for the album code, i.e. it is expected
            // to look like "/album/<code>".
            $albumCode = explode("/", $item)[2];
            $albumPath = $this->generateAlbumPath("", $albumCode, $baseMap, $baseDir, $item, $title[0] ?? "");
            if ($albumPath === "") {
                // Without a directory every image would be written relative to "/".
                Log::error("No album directory resolved for " . $item . ", album skipped");
                continue;
            }

            $images = $this->queryImages($baseUrl, $item, 0, $NUM_OF_ATTEMPTS);
            foreach ($images as $image) {
                // Drop the thumbnail path segment from the URL; a no-op when it is absent.
                $image = str_replace("thumb_600x900/", "", $image);

                if (!$this->checkIfNeedDownload($image, $username, $albumCode, $name, $albumPath)) {
                    continue;
                }
                $this->processDownloadImage($image, $baseUrl, $item, $albumPath, $username, $albumCode);
            }

            // Random 0.1s - 1s pause between albums (usleep takes microseconds).
            usleep(1000 * random_int(100, 1000));
        }
    }
}
/**
 * Download a single album of one configured model.
 *
 * The album page is addressed directly as "/album/<$albumOuterCode>"; title
 * lookup, image listing and downloading are delegated to the private helpers
 * of this class (getAlbumTitle, queryImages, imageFileExists,
 * processDownloadImage) instead of duplicating their retry logic inline.
 * Images already present in the album directory are skipped.
 *
 * NOTE(review): the method still ends the script with exit, as the original
 * did; callers may rely on that, so it is kept.
 *
 * @param string     $iUser          Key into self::$name_dir.
 * @param int|string $albumOuterCode Album id as it appears in the "/album/<id>" path.
 * @return void Never returns normally; the script exits when the album is done.
 * @throws \InvalidArgumentException When $iUser is not a key of self::$name_dir.
 * @throws Exception Propagated from the helpers and from random_int().
 */
public function singleAlbum($iUser = "ycc", $albumOuterCode = 21429)
{
    if (!is_string($iUser) || !isset(self::$name_dir[$iUser])) {
        throw new \InvalidArgumentException("Unknown user key: " . var_export($iUser, true));
    }

    $NUM_OF_ATTEMPTS = 50;
    $username = $iUser;
    $name = self::$name_dir[$iUser];
    dump("here {$username}, {$name["code"]}");

    $baseUrl = "https://www.xsnvshen.com";
    $baseDir = "/Volumes/intel660p/image/xg/" . $name["dir"] . "/";
    if (!file_exists($baseDir)) {
        mkdir($baseDir, 0777, true);
    }

    // Target the requested album; the original pinned this to "/album/21429",
    // which made every other $albumOuterCode a silent no-op.
    $albumCode = (string) $albumOuterCode;
    $item = "/album/" . $albumCode;

    // Random 1s - 10s pause before hitting the album page (usleep takes microseconds).
    usleep(random_int(1000, 10000) * 1000);
    echo "相册子链接: " . $item . "\n";

    $title = $this->getAlbumTitle($baseUrl, $item);

    $albumPath = $baseDir . $albumCode . "-" . ($title[0] ?? "");
    if (!file_exists($albumPath)) {
        mkdir($albumPath, 0777, true);
    }

    $images = $this->queryImages($baseUrl, $item, 0, $NUM_OF_ATTEMPTS);
    foreach ($images as $image) {
        // Drop the thumbnail path segment from the URL; a no-op when it is absent.
        $image = str_replace("thumb_600x900/", "", $image);

        if ($this->imageFileExists($albumPath, $image, $username, $albumCode)) {
            continue;
        }
        $this->processDownloadImage($image, $baseUrl, $item, $albumPath, $username, $albumCode);
    }

    usleep(1000 * random_int(100, 1000));
    exit;
}
/**
 * List the album links found on a model's profile page.
 *
 * Per the original note, the profile page shows all albums at once, so no
 * pagination is needed.
 *
 * @param array $name Entry of self::$name_dir; only its "code" is read here.
 * @return array The href attribute of every matched album anchor.
 */
private function queryAllAlbum($name): array
{
    CommonUtils::randomSleep(100);

    $profileUrl = "https://xsnvshen.com/girl/" . $name["code"];
    $profilePage = $this->getQueryInstance()->get($profileUrl);

    return $profilePage
        ->find(".entryAblum > .star-mod-bd > ul > li > a")
        ->attrs("href")
        ->all();
}
/**
 * Load an album page and return the texts of its "h1 > a" elements,
 * whose first entry the callers use as the album title.
 *
 * The page load is attempted up to 9 times with a random 1s - 10s pause after
 * each failure. While the first title reads "古诗文" the page is re-fetched
 * (presumably a decoy page served instead of the album - TODO confirm).
 *
 * @param string $baseUrl Site root, e.g. "https://www.xsnvshen.com".
 * @param string $item    Album path appended to $baseUrl.
 * @return mixed Result of QueryList's texts(); callers index it with [0].
 * @throws Exception When the page could not be loaded in any attempt, or
 *                   propagated from random_int() / the re-fetch loop.
 */
private function getAlbumTitle(string $baseUrl, $item)
{
    $albumUrl = $baseUrl . $item;
    $html = null;

    for ($queryItemUrlTimes = 1; $queryItemUrlTimes < 10; $queryItemUrlTimes++) {
        try {
            $html = $this->getQueryInstance()->get($albumUrl);
            break;
        } catch (Exception $e) {
            Log::error("查询相册子链接失败,将重试, 异常信息: " . $e->getMessage());
            // usleep takes microseconds: this is a 1s - 10s pause.
            $sleepTime = 1000 * random_int(1000, 10000);
            echo "查询相册子链接失败 sleep {$sleepTime} nano second \n";
            usleep($sleepTime);
        }
    }

    if ($html === null) {
        // Previously execution fell through and crashed on an undefined $html.
        throw new Exception("Unable to load album page after repeated attempts: " . $albumUrl);
    }

    $title = $html->find("h1 > a")->texts();
    dump($title);
    echo "相册名: " . ($title[0] ?? "") . "\n";

    while (trim((string) ($title[0] ?? "")) === "古诗文") {
        echo "here error happenned \n";
        // Random 0.05s - 50s pause before asking for the page again.
        usleep(random_int(1, 1000) * 50000);
        $html = $this->getQueryInstance()->get($albumUrl);
        $title = $html->find("h1 > a")->texts();
    }

    return $title;
}
/**
 * Resolve (and create when missing) the directory an album is stored in.
 *
 * An entry in $baseMap for $albumCode wins. Otherwise the directory is
 * "<baseDir><code>-<title>", with the code taken from the third "/"-separated
 * segment of $item. Unlike the original, an already existing directory that
 * is absent from $baseMap is returned instead of an empty string.
 *
 * @param string $albumPath Initial value; kept when it is already non-empty
 *                          and $baseMap has no entry for the album.
 * @param string $albumCode Key looked up in $baseMap.
 * @param array  $baseMap   Album code => existing album directory.
 * @param string $baseDir   Model directory, with trailing slash.
 * @param string $item      Album path; its third "/"-separated segment is the code.
 * @param string $title     Album title used in the directory name.
 * @return string Directory path of the album.
 */
private function generateAlbumPath(string $albumPath, string $albumCode, array $baseMap, string $baseDir, $item, $title): string
{
    if (array_key_exists($albumCode, $baseMap)) {
        return $baseMap[$albumCode];
    }

    if ($albumPath !== "") {
        return $albumPath;
    }

    $albumPath = $baseDir . explode("/", $item)[2] . "-" . $title;
    if (!is_dir($albumPath)) {
        mkdir($albumPath, 0777, true);
    }

    return $albumPath;
}
/**
 * Collect the src attribute of every ".swi-hd > img" element on an album page.
 *
 * A failing request is retried after a random 1s - 10s pause until $attempts
 * exceeds $NUM_OF_ATTEMPTS. When every attempt fails an empty list is
 * returned (and the failure logged) instead of crashing on a method call
 * against the initial empty array, as the original did.
 *
 * @param string $baseUrl         Site root.
 * @param string $item            Album path appended to $baseUrl.
 * @param int    $attempts        Number of attempts already used (callers pass 0).
 * @param int    $NUM_OF_ATTEMPTS Upper bound compared against $attempts.
 * @return array Image src values; empty when the page could not be loaded.
 * @throws Exception Propagated from random_int().
 */
private function queryImages(string $baseUrl, $item, int $attempts, int $NUM_OF_ATTEMPTS): array
{
    do {
        try {
            return $this->getQueryInstance()
                ->get($baseUrl . $item)
                ->find(".swi-hd > img")
                ->attrs("src")
                ->all();
        } catch (Exception $e) {
            echo $e->getTraceAsString() . "\n";
            // usleep takes microseconds: this is a 1s - 10s pause.
            $sleepTime = 10000 * random_int(100, 1000);
            echo "sleep {$sleepTime} nano second \n";
            usleep($sleepTime);
            $attempts++;
        }
    } while ($attempts <= $NUM_OF_ATTEMPTS);

    Log::error("Unable to list images, giving up on " . $baseUrl . $item);

    return [];
}
/**
 * Decide whether an image still has to be downloaded.
 *
 * The answer depends only on the file: a missing file needs downloading, an
 * existing one does not. As a side effect, an existing file that has no
 * XiuGirl record yet gets one (is_downloaded = 1). The database is therefore
 * only queried when the file exists, and with exists() rather than by loading
 * the matching rows.
 *
 * @param string $image     Image URL without scheme, as scraped from the page.
 * @param string $username  User key used as file-name prefix.
 * @param string $albumCode Album id.
 * @param array  $name      Entry of self::$name_dir; only "code" is read.
 * @param string $albumPath Album directory.
 * @return bool True when the image must be downloaded.
 */
private function checkIfNeedDownload(string $image, string $username, string $albumCode, $name, string $albumPath): bool
{
    if (!$this->imageFileExists($albumPath, $image, $username, $albumCode)) {
        return true;
    }

    $imageFileName = $username . "-" . $albumCode . "-" . pathinfo("http:" . $image)["filename"] . ".jpg";
    $alreadyRecorded = XiuGirl::where([
        "user_id" => $name["code"],
        "album_id" => $albumCode,
        "image_name" => $imageFileName
    ])->exists();

    if (!$alreadyRecorded) {
        // The file is on disk but unknown to the database: record it.
        $xiuGirl = new XiuGirl;
        $xiuGirl->user_id = $name["code"];
        $xiuGirl->album_id = $albumCode;
        $xiuGirl->image_name = $imageFileName;
        $xiuGirl->site_from = "xiugirl";
        $xiuGirl->is_downloaded = 1;
        $xiuGirl->save();
    }

    return false;
}
/**
 * Tell whether an image is already stored in the album directory, either
 * under its bare name ("<file>.jpg") or under the prefixed name
 * ("<username>-<albumCode>-<file>.jpg").
 *
 * @param string $albumPath Album directory.
 * @param string $image     Image URL without scheme.
 * @param string $username  User key used as file-name prefix.
 * @param string $albumCode Album id used in the prefixed file name.
 * @return bool True when either file exists.
 */
private function imageFileExists(string $albumPath, string $image, string $username, string $albumCode): bool
{
    $plainName = pathinfo("http:" . $image)["filename"] . ".jpg";
    $directory = $albumPath . "/";

    if (file_exists($directory . $plainName)) {
        return true;
    }

    return file_exists($directory . $username . "-" . $albumCode . "-" . $plainName);
}
/**
 * Download one image with cURL and store it as
 * "<albumPath>/<username>-<albumCode>-<file>.jpg".
 *
 * The request sends a browser User-Agent and the album page as Referer. A
 * failed transfer is retried up to 99 times with a pause of 2s - 11s between
 * tries. If the transfer still failed, returned an empty body, or the target
 * file cannot be created, nothing is written and the problem is logged. The
 * original wrote the failed result anyway, leaving an empty .jpg that later
 * counted as "already downloaded".
 *
 * @param string $image     Image URL without scheme ("//host/path/file.jpg").
 * @param string $baseUrl   Site root, used to build the Referer.
 * @param string $item      Album path, used to build the Referer.
 * @param string $albumPath Album directory the file is written to.
 * @param string $username  User key used as file-name prefix.
 * @param string $albumCode Album id used in the file name.
 * @return void
 * @throws Exception Propagated from random_int().
 */
private function processDownloadImage(string $image, string $baseUrl, $item, string $albumPath, string $username, string $albumCode): void
{
    $imageUrl = "https:" . $image;
    $targetFile = $albumPath . "/" . $username . "-" . $albumCode . "-" . pathinfo("http:" . $image)["filename"] . ".jpg";

    $curl_handle = curl_init();
    curl_setopt($curl_handle, CURLOPT_URL, $imageUrl);
    // NOTE(review): CURLOPT_CONNECTTIMEOUT is in seconds, so this is ~33 minutes;
    // kept as in the original - confirm whether milliseconds were intended.
    curl_setopt($curl_handle, CURLOPT_CONNECTTIMEOUT, 2000);
    curl_setopt($curl_handle, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($curl_handle, CURLOPT_USERAGENT, "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36");
    curl_setopt($curl_handle, CURLOPT_REFERER, $baseUrl . $item);

    $content = curl_exec($curl_handle);
    for ($i = 1; $content === false && $i < 100; $i++) {
        echo "Curl error: " . curl_error($curl_handle) . "\n";
        echo "retry times: " . $i . " times \n";
        sleep(1);
        // usleep takes microseconds: this is a further 1s - 10s pause.
        $sleepTime = 1000 * random_int(1000, 10000);
        echo "retry sleep {$sleepTime} nano second \n";
        usleep($sleepTime);
        $content = curl_exec($curl_handle);
    }
    echo curl_error($curl_handle);

    if ($content === false || $content === "") {
        Log::error("Image download failed, nothing written: " . $imageUrl);
        return;
    }

    // Mode "x" refuses to overwrite: fopen fails when the file already exists.
    $fp = fopen($targetFile, "x");
    if ($fp === false) {
        Log::error("Unable to create image file: " . $targetFile);
        return;
    }
    fwrite($fp, $content);
    fclose($fp);

    // Random 0.1s - 1s pause between images.
    $sleepTime = 1000 * random_int(100, 1000);
    echo "after write image sleep {$sleepTime} nano second \n";
    usleep($sleepTime);
}
}