@ -4,6 +4,7 @@
namespace App\Services ;
use App\ImageRecord ;
use Exception ;
use ErrorException ;
use Log ;
@ -11,97 +12,124 @@ use QL\QueryList;
class XiurenjiService
{
// Site origin that scrapeAlbum, scrapePageAlbum and scrapeSingleAlbum prepend to relative links.
// NOTE(review): $domainUrl, $xiurenRootUrl and $rootDir are each declared twice in this span
// (here and again after $name_dir) with different values; this looks like two revisions
// overlapping — confirm which set is the intended one.
public $domainUrl = " https://www.xiurenji.net " ;
// Listing root; scrapeAlbum and scrapeSingleAlbum concatenate page file names onto it.
public $xiurenRootUrl = " https://www.xiurenji.net/XiuRen/ " ;
// public $rootDir = "/Users/shixuesen/Documents/tmp/xiuren/";
// public $rootDir = "/Volumes/Backup/images/xiuren/";
// Local directory prefix; scrapeSingleAlbum passes $rootDir . $albumName to parseContent.
public $rootDir = " /Volumes/intel660p/image/xiuren/ " ;
// Per-site lookup table keyed by site name. For each site, the path entry is the URL segment
// scrapeAlbum puts after the domain, and the dir entry is the sub-directory scrapeAlbum
// appends to $rootDir while it is working on that site.
private $name_dir = [
" xiuren " => [
" path " => " XiuRen/ " ,
" dir " => " xiuren/ "
],
" xiaoyu " => [
" path " => " XiaoYu/ " ,
" dir " => " xiaoyu/ "
],
" youwu " => [
" path " => " YouWu/ " ,
" dir " => " youwu/ "
],
" mygirl " => [
" path " => " MyGirl/ " ,
" dir " => " mygirl/ "
],
" huayang " => [
" path " => " HuaYang/ " ,
" dir " => " huayang/ "
],
" mfstar " => [
" path " => " MFStar/ " ,
" dir " => " mfstar/ "
],
" imiss " => [
" path " => " IMiss/ " ,
" dir " => " imiss/ "
]
];
// Second declarations of the three properties documented at the top of this span.
public $domainUrl = " https://www.xiurenb.net/ " ;
public $xiurenRootUrl = " https://www.xiurenb.net/XiuRen/ " ;
// public $rootDir = "/Users/shixuesen/Documents/tmp/xiuren/";
// public $rootDir = "/Volumes/Backup/images/xiuren/";
public $rootDir = " /Volumes/Crucial X6/Image/xr/ " ;
// Set in the constructor to the value returned by QueryList::getInstance();
// getEncodeHtmlContent loads the fetched HTML into it and returns it.
public $queryInstance ;
// Set in the constructor to a separately constructed QueryList object.
public $queryNew ;
/**
 * Prepare the two QueryList handles this service works with.
 *
 * $queryInstance receives whatever QueryList::getInstance() returns, while
 * $queryNew is a separately constructed QueryList object.
 */
public function __construct()
{
    $shared = QueryList::getInstance();
    $this->queryInstance = $shared;

    $standalone = new QueryList();
    $this->queryNew = $standalone;
}
public function scrapeAlbum ()
/**
 * Scrape every site section configured in $this->name_dir.
 *
 * Walks the keys of $this->name_dir in declaration order, prints the key that
 * is about to be processed and delegates the work to scrapeAlbum().
 *
 * @param int $num Value forwarded as scrapeAlbum()'s second argument for every
 *                 section. Defaults to 20, the value that was previously
 *                 hard-coded here, so existing zero-argument calls behave the same.
 * @return void
 */
public function scrapeAll($num = 20)
{
    // Only the keys are needed; scrapeAlbum() looks the entry up itself.
    foreach (array_keys($this->name_dir) as $siteKey) {
        dump(" current site: " . $siteKey);
        $this->scrapeAlbum($siteKey, $num);
    }
}
/**
 * Walk the listing pages of one site section and scrape each of them.
 *
 * Reads a count from the section's index page, then loops over listing pages
 * (index.html for page 0, index<N>.html otherwise) and hands each page URL to
 * scrapePageAlbum(). While it runs, $this->rootDir is extended with the
 * section's dir entry from $this->name_dir; the previous value is restored
 * before returning.
 *
 * NOTE(review): this span contains two $pageCount look-ups, two `for` headers
 * and two scrapePageAlbum() calls, and its braces do not balance as shown. It
 * looks like an older and a newer revision interleaved — confirm which lines
 * are live before changing any logic here.
 *
 * @param $path      Key into $this->name_dir selecting the section.
 * @param $num       Upper bound applied to the count read from the page (via min()).
 * @param $startPage Initial value of the page index in the second `for` header.
 */
public function scrapeAlbum ( $path , $num = 20 , $startPage = 0 )
{
// Divisor used to turn the count into a number of listing pages.
$pageSize = 20 ;
$pageCount = $this -> getEncodeHtmlContent ( " https://www.xiurenji.net/XiuRen/index.html " ) -> find ( " .page span " ) -> htmls () -> get ( 0 );
print_r ( $pageCount );
$urlPath = $this -> name_dir [ $path ][ " path " ];
// Remember the base directory so it can be put back at the end of the method.
$rootDir = $this -> rootDir ;
$this -> rootDir = $this -> rootDir . $this -> name_dir [ $path ][ " dir " ];
$pageCount = $this -> getEncodeHtmlContent ( " https://www.xiurenb.net/ $urlPath /index.html " ) -> find ( " .page span strong " ) -> htmls () -> get ( 0 );
dump ( " current site item count: " . $pageCount );
if (( int ) $pageCount > 0 ) {
$pageCount = 100 ;
for ( $i = 0 ; $i <= ceil ( $pageCount / $pageSize ); $i ++ ) {
// Never process more than the caller-supplied limit.
$pageCount = min ( $pageCount , $num ) ;
for ( $i = $startPage ; $i <= ceil ( $pageCount / $pageSize ); $i ++ ) {
$urlSuffix = " " ;
// Page 0 is the plain index file; later pages carry their index in the file name.
if ( $i == 0 ) {
$urlSuffix = " index.html " ;
} else {
$urlSuffix = " index " . $i . " .html " ;
}
$this -> scrapePageAlbum ( $this -> xiurenRootUrl . $urlSuffix );
// exit;
$this -> scrapePageAlbum ( $this -> domainUrl . $urlPath . $urlSuffix );
}
}
// Restore the base directory saved above.
$this -> rootDir = $rootDir ;
}
/**
 * Scrape every album linked from one listing page.
 *
 * Loads the page through getEncodeHtmlContent(), collects the anchor elements
 * matched by the selector, prints each href and calls scrapeSingleAlbum() with
 * $this->domainUrl prepended to that href.
 *
 * NOTE(review): the method header and the $items assignment each appear twice
 * (selectors ".dan a" and ".i_list a"); this looks like two revisions
 * overlapping — confirm which one is intended. As written, the second
 * assignment is the one whose result is iterated.
 *
 * @param $url Listing page URL.
 */
public function scrapePageAlbum ( $url ) {
public function scrapePageAlbum ( $url )
{
$pageContent = $this -> getEncodeHtmlContent ( $url );
// dump($pageContent);
$items = $pageContent -> find ( " .dan a " ) -> getElements () ;
$items = $pageContent -> find ( " .i_list a " ) -> getElements ();
// Only referenced by the commented-out skip logic inside the loop below.
$i = 0 ;
foreach ( $items as $item ) {
// $i++;
// if ($i < 18) {
// continue;
// # code...
// }
dump ( $item -> getAttribute ( " href " ));
$this -> scrapeSingleAlbum ( $this -> domainUrl . $item -> getAttribute ( " href " ));
}
}
/**
 * Download one album, starting from the URL of its first page.
 *
 * Logs the URL, loads the page through getEncodeHtmlContent(), works out an
 * album name and passes each page of the album to parseContent() together
 * with a running image counter ($imageNo) and the target directory
 * $this->rootDir . $albumName.
 *
 * NOTE(review): this span has two header lines and what look like two
 * different bodies back to back: one that takes the name from
 * ".item_title h1" and the follow-up pages from ".content:eq(0) .page a",
 * and another that parses ".ina p:nth-child(2)" and cuts an album code out of
 * the URL. They appear to be two revisions interleaved — confirm which lines
 * are live before changing any logic here.
 *
 * @param $url Album page URL.
 */
public function scrapeSingleAlbum ( $url ) {
public function scrapeSingleAlbum ( $url )
{
Log :: info ( " scrapeSingleAlbum $url " );
$pageContent = $this -> getEncodeHtmlContent ( $url );
$pageSize = 3 ;
$items = $pageContent -> find ( " .ina p:nth-child(2) " ) -> texts ();
$pageItems = $pageContent -> find ( " .page a:eq(-2) " ) -> htmls ();
$isSinglePage = false ;
$pageCount = 0 ;
// No pager entry matched: treat the album as a single page.
if ( count ( $pageItems ) <= 0 ) {
$isSinglePage = true ;
$pageCount = 1 ;
// dump("this album is error: ". $url);
// Log::error("this album is error: " . $url);
// return;
$albumName = $pageContent -> find ( " .item_title h1 " ) -> htmls () -> get ( 0 );
$pageItems = $pageContent -> find ( " .content:eq(0) .page a " ) -> attrs ( " href " ) -> all ();
// Leave early when checkAlbumHasDownload() reports the album as already recorded.
if ( $this -> checkAlbumHasDownload ( $albumName )) {
Log :: info ( " 已经下载过了,相册名: " . $albumName );
return ;
}
dump ( " 当前相册名: " . $albumName );
// Counter passed by reference to parseContent(), shared across all pages of the album.
$imageNo = 1 ;
$this -> parseContent ( $this -> rootDir . $albumName , $pageContent , $imageNo );
// Drop the first two pager links and the last one; the rest are followed below.
$pageItems = array_slice ( $pageItems , 2 , count ( $pageItems ) - 3 );
foreach ( $pageItems as $item ) {
$pageContent = $this -> getEncodeHtmlContent ( $this -> domainUrl . $item );
$this -> parseContent ( $this -> rootDir . $albumName , $pageContent , $imageNo );
}
// dump($pageItems);exit;
$item = $items [ 0 ];
// $imageNum = substr($item, strrpos($item, "[") + 1, strrpos($item, "P]") - strrpos($item, "[") - 1);
// $pageCount = ceil($imageNum / $pageSize);
// Keep a page count that was already set; otherwise take it from the first pager entry.
$pageCount = $pageCount > 0 ? $pageCount : ( int ) $pageItems [ 0 ];
// dump("pageCount: ". $pageCount . "");exit;
// Album code: the part of the URL between the XiuRen/ segment and the last dot.
$slashPos = strpos ( $url , " XiuRen/ " ) + 7 ;
$dotPos = strrpos ( $url , " . " );
$albumCode = substr ( $url , $slashPos , $dotPos - $slashPos );
// $albumName0 = substr($item, strpos($item, "["), strrpos($item, "]") - strpos($item, "[") + 1);
$albumName = ltrim ( substr ( $item , 6 , strrpos ( $item , " ] " ) - 5 ));
// Fall back to a different page element when the extracted name is too short,
// too long, or lacks the "No" marker.
if ( mb_strlen ( $albumName ) <= 12 || mb_strlen ( $albumName ) >= 50 || ! str_contains ( $albumName , " No " )) {
dump ( " old Album: " . $albumName );
$albumName = $pageContent -> find ( " .ina p b:nth-child(2) " ) -> texts ();
// $albumName = ltrim(substr($item, 6, strrpos(substr($item, 0, strrpos($item, "@")), " ") - 5));
dump ( " new Album: " . urldecode ( $albumName [ 0 ]));
$albumName = urldecode ( $albumName [ 0 ]);
}
$imageNo = 1 ;
$this -> parseContent ( $this -> rootDir . $albumName , $pageContent , $imageNo );
dump ( " albumName: " . $albumName );
// dump("item: ". $item);
// exit;
// Follow-up pages are addressed as <albumCode>_<i>.html under $xiurenRootUrl.
for ( $i = 1 ; $i < $pageCount ; $i ++ ) {
$pageContent = $this -> getEncodeHtmlContent ( $this -> xiurenRootUrl . $albumCode . " _ " . $i . " .html " );
$this -> parseContent ( $this -> rootDir . $albumName , $pageContent , $imageNo );
}
// dump("slashPos: " . $slashPos . " dotPos: " . $dotPos . " albumCode: ". $albumCode);
// dump($item);
// exit;
}
public function parseContent ( $dir , $pageContent , & $imageNo )
@ -114,43 +142,38 @@ class XiurenjiService
return ;
}
}
$images = $pageContent -> find ( " .img p img " ) -> getElements ();
$user = $pageContent -> find ( " .title_pc tr:eq(2) td a:eq(2) " ) -> html ();
// $user = $this->queryNew->find(".ina a:eq(-1) b")->html();
// body > div.nr3 > table.title_pc > tbody > tr:nth-child(3) > td > a:nth-child(2)
$user = iconv_gbk_to_uft8 ( $user );
dump ( " user is " . $user );
if ( str_contains ( $user , " # " ) || str_contains ( $user , " & " ) || trim ( $user ) == " " ) {
$user = $this -> queryNew -> find ( " .ina a:eq(-1) b " ) -> html ();;
$user = iconv_gbk_to_uft8 ( $user );
dump ( " new user is " . $user );
$images = $pageContent -> find ( " .content p img " ) -> getElements ();
$user = $pageContent -> find ( " .item_info div a:eq(-1) span " ) -> htmls () -> get ( 0 );
if ( $imageNo == 1 ) {
// 只在每个相册第一次输出名字
dump ( " user is " . $user );
}
foreach ( $images as $image ) {
usleep ( random_int ( 10 , 100 ) * 100 );
$imageUrl = $image -> getAttribute ( " src " );
$trueImageUrl = " https://x1.plmn5.com/U " . substr ( $imageUrl , 2 );
$trueImageUrl = " https://www.xiurenji.net " . $imageUrl ;
$trueImageUrl = " https://www.xiurenji.net " . $imageUrl ;
$fileInfo = pathinfo ( $trueImageUrl );
if ( file_exists ( $dir . " / " . $fileInfo [ " basename " ])) {
rename ( $dir . " / " . $fileInfo [ " basename " ], $dir . " / " . $imageNo . " - " . $fileInfo [ " basename " ]);
if ( file_exists ( $dir . DIRECTORY_SEPARATOR . $fileInfo [ " basename " ])) {
rename ( $dir . DIRECTORY_SEPARATOR . $fileInfo [ " basename " ], $dir . DIRECTORY_SEPARATOR . $imageNo . " - " . $fileInfo [ " basename " ]);
$imageNo ++ ;
continue ;
}
if ( file_exists ( $dir . " / " . $imageNo . " - " . $fileInfo [ " basename " ])) {
rename ( $dir . " / " . $imageNo . " - " . $fileInfo [ " basename " ], $dir . " / " . trim ( $user ) . " - " . $imageNo . " - " . $fileInfo [ " basename " ]);
if ( file_exists ( $dir . DIRECTORY_SEPARATOR . $imageNo . " - " . $fileInfo [ " basename " ])) {
rename ( $dir . DIRECTORY_SEPARATOR . $imageNo . " - " . $fileInfo [ " basename " ], $dir . DIRECTORY_SEPARATOR . trim ( $user ) . " - " . $imageNo . " - " . $fileInfo [ " basename " ]);
$imageNo ++ ;
continue ;
}
if ( file_exists ( $dir . " / " . trim ( $user ) . " - " . $imageNo . " - " . $fileInfo [ " basename " ])) {
if ( file_exists ( $dir . DIRECTORY_SEPARATOR . trim ( $user ) . " - " . $imageNo . " - " . $fileInfo [ " basename " ])) {
$imageNo ++ ;
continue ;
}
dump ( $fileInfo );
// dump($fileInfo);
$attempts = 0 ;
$content = " " ;
do {
try {
$curl_handle = curl_init ();
$curl_handle = curl_init ();
curl_setopt ( $curl_handle , CURLOPT_URL , $trueImageUrl );
curl_setopt ( $curl_handle , CURLOPT_CONNECTTIMEOUT , 20000 );
curl_setopt ( $curl_handle , CURLOPT_RETURNTRANSFER , 1 );
@ -162,21 +185,21 @@ class XiurenjiService
curl_setopt ( $curl_handle , CURLOPT_TIMEOUT , 0 );
curl_setopt ( $curl_handle , CURLOPT_HTTP_VERSION , CURL_HTTP_VERSION_1_1 );
curl_setopt ( $curl_handle , CURLOPT_CUSTOMREQUEST , 'GET' );
curl_setopt ( $curl_handle , CURLOPT_HTTPHEADER , array (
'authority: www.xiurenji.net' ,
'pragma: no-cache' ,
'cache-control: no-cache' ,
'sec-ch-ua: "Google Chrome";v="95", "Chromium";v="95", ";Not A Brand";v="99"' ,
'sec-ch-ua-mobile: ?0' ,
'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36' ,
'sec-ch-ua-platform: "macOS"' ,
'accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8' ,
'sec-fetch-site: same-origin' ,
'sec-fetch-mode: no-cors' ,
'sec-fetch-dest: image' ,
'referer: https://www.xiurenji.net/XiuRen/9483.html' ,
'accept-language: zh-CN,zh;q=0.9' ,
'cookie: UM_distinctid=17cfa8bea8eb9e-0dd0c6d032d0fc-1c306851-13c680-17cfa8bea8fc85; CNZZDATA1278618868=1505121253-1636283360-%7C1636283360; __51cke__=; ASPSESSIONIDQAQAATSQ=LBLGNPMDHKKMNOPDBCEAPIMH; __tins__20641871=%7B%22sid%22%3A%201636291046220%2C%20%22vd%22%3A%202%2C%20%22expires%22%3A%201636292852634%7D; __51laig__=2'
curl_setopt ( $curl_handle , CURLOPT_HTTPHEADER , array (
'authority: www.xiurenji.net' ,
'pragma: no-cache' ,
'cache-control: no-cache' ,
'sec-ch-ua: "Google Chrome";v="95", "Chromium";v="95", ";Not A Brand";v="99"' ,
'sec-ch-ua-mobile: ?0' ,
'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36' ,
'sec-ch-ua-platform: "macOS"' ,
'accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8' ,
'sec-fetch-site: same-origin' ,
'sec-fetch-mode: no-cors' ,
'sec-fetch-dest: image' ,
'referer: https://www.xiurenji.net/XiuRen/9483.html' ,
'accept-language: zh-CN,zh;q=0.9' ,
'cookie: UM_distinctid=17cfa8bea8eb9e-0dd0c6d032d0fc-1c306851-13c680-17cfa8bea8fc85; CNZZDATA1278618868=1505121253-1636283360-%7C1636283360; __51cke__=; ASPSESSIONIDQAQAATSQ=LBLGNPMDHKKMNOPDBCEAPIMH; __tins__20641871=%7B%22sid%22%3A%201636291046220%2C%20%22vd%22%3A%202%2C%20%22expires%22%3A%201636292852634%7D; __51laig__=2'
));
$content = curl_exec ( $curl_handle );
if ( $content === false ) {
@ -185,36 +208,36 @@ class XiurenjiService
throw $le ;
}
curl_close ( $curl_handle );
// $content = file_get_contents($trueImageUrl);
} catch ( ErrorException | Exception $e ) {
// $content = file_get_contents($trueImageUrl);
} catch ( ErrorException | Exception $e ) {
echo $e -> getTraceAsString () . " \n " ;
$sleepTime = 10000 * random_int ( 100 , 1000 );
echo " wait for $trueImageUrl sleep { $sleepTime } nano second \n " ;
usleep ( $sleepTime );
$attempts ++ ;
$attempts ++ ;
continue ;
}
break ;
} while ( $attempts < 100 );
} while ( $attempts < 100 );
if ( $content != " " ) {
file_put_contents ( $dir . " / " . trim ( $user ) . " - " . $imageNo . " - " . $fileInfo [ " basename " ], $content );
file_put_contents ( $dir . DIRECTORY_SEPARATOR . trim ( $user ) . " - " . $imageNo . " - " . $fileInfo [ " basename " ], $content );
} else {
Log :: error ( " image content is empty " . $trueImageUrl );
Log :: error ( " image content is empty " . $trueImageUrl );
}
$imageNo ++ ;
// dump("current imageNo: " . $imageNo);
// dump("current imageNo: " . $imageNo);
}
}
public function getEncodeHtmlContent ( $url ) {
public function getEncodeHtmlContent ( $url )
{
$attempts = 0 ;
$html = " " ;
$arrContextOptions = array (
" ssl " => array (
" allow_self_signed " => true ,
" verify_peer " => false ,
" verify_peer_name " => false ,
$arrContextOptions = array (
" ssl " => array (
" allow_self_signed " => true ,
" verify_peer " => false ,
" verify_peer_name " => false ,
),
);
do {
@ -233,7 +256,7 @@ class XiurenjiService
CURLOPT_SSL_VERIFYPEER => false ,
CURLOPT_SSL_VERIFYHOST => false ,
CURLOPT_HTTPHEADER => array (
CURLOPT_HTTPHEADER => array (
'authority: www.xiurenji.net' ,
'pragma: no-cache' ,
'cache-control: no-cache' ,
@ -257,23 +280,32 @@ class XiurenjiService
echo $error ;
curl_close ( $curl );
// echo $response;
// $html = iconv('gb2312','UTF-8//IGNORE', $response);
// echo $response;
// $html = iconv('gb2312','UTF-8//IGNORE', $response);
$html = iconv_gbk_to_uft8 ( $response );
$this -> queryNew -> setHtml ( $response );
// $html = $response;
// $html = $response;
} catch ( Exception $e ) {
echo $e -> getMessage () . " \n " ;
echo $e -> getMessage () . " \n " ;
echo $e -> getTraceAsString () . " \n " ;
$sleepTime = 10000 * random_int ( 100 , 1000 );
echo " sleep { $sleepTime } nano second \n " ;
usleep ( $sleepTime );
$attempts ++ ;
$attempts ++ ;
continue ;
}
break ;
} while ( $attempts < 100 );
} while ( $attempts < 100 );
// dump("current url: " . $url);
return $this -> queryInstance -> setHtml ( $html );
}
/**
 * Tell whether an ImageRecord entry already exists for the given album name.
 *
 * Looks up the first ImageRecord whose name field matches $albumName and
 * reports whether one was found. Both outcomes yield an explicit boolean;
 * previously the "not found" path fell off the end of the method and
 * returned null implicitly.
 *
 * @param $albumName Album name to look up.
 * @return bool true when a matching record exists, false otherwise.
 */
private function checkAlbumHasDownload($albumName)
{
    $record = ImageRecord::where(" name ", $albumName)->first();

    return $record != null;
}
}