You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

188 lines
7.4 KiB

<?php
namespace App\Services;
use Exception;
use ErrorException;
use Log;
use QL\QueryList;
/**
 * Scrapes album listing pages under $xiurenRootUrl and stores every image of
 * every album in a per-album directory below $rootDir.
 *
 * All pages are loaded through the single QueryList object held in
 * $queryInstance; getEncodeHtmlContent() returns the result of calling
 * setHtml() on that object, so (presumably) every fetch replaces the document
 * that earlier results were querying. Methods below therefore read everything
 * they need from a page before fetching the next one.
 */
class XiurenjiService
{
    /** Maximum number of attempts for a single page or image fetch. */
    private const MAX_ATTEMPTS = 100;

    /** Divisor used to turn the listing total into a number of index pages. */
    private const LISTING_PAGE_SIZE = 20;

    /** @var string Scheme and host that relative links and image paths are appended to. */
    public $domainUrl = "https://www.xiurenji.cc";

    /** @var string Base URL of the listing; index and album pages live below it. */
    public $xiurenRootUrl = "https://www.xiurenji.cc/XiuRen/";

    /** @var string Existing local directory (with trailing slash) that album directories are created in. */
    public $rootDir = "/Volumes/Backup/images/xiuren/";

    /** @var QueryList Shared query object every fetched page is loaded into. */
    public $queryInstance;

    public function __construct()
    {
        $this->queryInstance = QueryList::getInstance();
    }

    /**
     * Walks the listing index pages and scrapes every album linked from them.
     *
     * The first index page is fetched and the content of its first
     * ".page span" element is printed; nothing is scraped unless that content
     * casts to a positive integer.
     *
     * @param int|null $totalOverride Value used in place of the scraped number
     *                                when computing how many index pages to
     *                                visit. The default of 40 reproduces the
     *                                value that used to be hard-coded here;
     *                                pass null to use the scraped number.
     *                                NOTE(review): the number is presumably an
     *                                album total (it is divided by a page size
     *                                of 20) - confirm against the site.
     * @return void
     */
    public function scrapeAlbum($totalOverride = 40)
    {
        $scrapedTotal = $this->getEncodeHtmlContent($this->xiurenRootUrl . "index.html")
            ->find(".page span")
            ->htmls()
            ->get(0);
        print_r($scrapedTotal);

        if ((int) $scrapedTotal <= 0) {
            return;
        }

        $total = $totalOverride === null ? (int) $scrapedTotal : (int) $totalOverride;
        $lastIndex = (int) ceil($total / self::LISTING_PAGE_SIZE);

        // Index 0 is "index.html"; every later index N maps to "index<N>.html".
        for ($i = 0; $i <= $lastIndex; $i++) {
            $urlSuffix = $i === 0 ? "index.html" : "index" . $i . ".html";
            $this->scrapePageAlbum($this->xiurenRootUrl . $urlSuffix);
        }
    }

    /**
     * Scrapes every album linked from one listing page.
     *
     * @param string $url Absolute URL of the listing page.
     * @return void
     */
    public function scrapePageAlbum($url)
    {
        $pageContent = $this->getEncodeHtmlContent($url);

        // Read all hrefs before scraping any album: scraping loads other pages
        // into the same shared query object.
        $hrefs = [];
        foreach ($pageContent->find(".dan a")->getElements() as $anchor) {
            $hrefs[] = $anchor->getAttribute("href");
        }

        foreach ($hrefs as $href) {
            dump($href);
            $this->scrapeSingleAlbum($this->domainUrl . $href);
        }
    }

    /**
     * Downloads all pages of one album.
     *
     * The album is skipped (and logged) when the page has no ".page a:eq(-2)"
     * element, whose content is used as the number of pages.
     *
     * @param string $url Absolute URL of the album's first page.
     * @return void
     */
    public function scrapeSingleAlbum($url)
    {
        $pageContent = $this->getEncodeHtmlContent($url);
        $items = $pageContent->find(".ina p:nth-child(2)")->texts();
        $pageItems = $pageContent->find(".page a:eq(-2)")->htmls();
        if (count($pageItems) <= 0) {
            dump("this album is error: " . $url);
            Log::error("this album is error: " . $url);
            return;
        }

        $pageCount = (int) $pageItems[0];
        $albumCode = $this->extractAlbumCode($url);
        // Tracks whether $pageContent still refers to the album's first page.
        $firstPageLoaded = true;

        foreach ($items as $item) {
            if (!$firstPageLoaded) {
                // A previous iteration fetched follow-up pages; reload the
                // first page so name lookup and image parsing use the right one.
                $pageContent = $this->getEncodeHtmlContent($url);
                $firstPageLoaded = true;
            }

            $albumName = $this->resolveAlbumName($item, $pageContent);

            $imageNo = 1;
            $this->parseContent($this->rootDir . $albumName, $pageContent, $imageNo);
            dump("albumName: " . $albumName);

            // Follow-up pages are "<albumCode>_<i>.html" for i = 1 .. pageCount - 1.
            for ($i = 1; $i < $pageCount; $i++) {
                $pageContent = $this->getEncodeHtmlContent($this->xiurenRootUrl . $albumCode . "_" . $i . ".html");
                $firstPageLoaded = false;
                $this->parseContent($this->rootDir . $albumName, $pageContent, $imageNo);
            }
        }
    }

    /**
     * Saves every ".img p img" image of an already loaded page into $dir.
     *
     * Files are named "<imageNo>-<basename>". A file that already exists under
     * the bare basename is renamed to the numbered form, and a file that
     * already exists under the numbered form is left alone; in both cases
     * nothing is downloaded.
     *
     * @param string $dir         Target directory; created if missing (its parent must exist).
     * @param mixed  $pageContent Query object of the loaded page, as returned by getEncodeHtmlContent().
     * @param int    $imageNo     Running image number, incremented once per image (by reference).
     * @return void
     */
    public function parseContent($dir, $pageContent, &$imageNo)
    {
        if (!$this->ensureDirectory($dir)) {
            return;
        }

        $images = $pageContent->find(".img p img")->getElements();
        foreach ($images as $image) {
            $trueImageUrl = $this->domainUrl . $image->getAttribute("src");
            $fileInfo = pathinfo($trueImageUrl);
            $plainPath = $dir . "/" . $fileInfo["basename"];
            $numberedPath = $dir . "/" . $imageNo . "-" . $fileInfo["basename"];

            if (file_exists($plainPath)) {
                rename($plainPath, $numberedPath);
                $imageNo++;
                continue;
            }
            if (file_exists($numberedPath)) {
                $imageNo++;
                continue;
            }

            dump($fileInfo);
            // Short random pause (1-10 ms) before each request.
            usleep(random_int(10, 100) * 100);
            $content = $this->downloadImage($trueImageUrl);
            if ($content === "") {
                Log::error("image content is empty " . $trueImageUrl);
            } elseif (file_put_contents($numberedPath, $content) === false) {
                Log::error("unable to write image file " . $numberedPath);
            }
            $imageNo++;
        }
    }

    /**
     * Fetches a page, converts it from GB2312 to UTF-8 and loads it into the
     * shared query object.
     *
     * The fetch is retried up to MAX_ATTEMPTS times with a random 1-10 second
     * pause between attempts. If every attempt fails, an empty document is
     * loaded and the failure is logged.
     *
     * @param string $url Absolute URL of the page.
     * @return mixed Result of setHtml() on $queryInstance.
     */
    public function getEncodeHtmlContent($url)
    {
        $html = "";
        for ($attempt = 0; $attempt < self::MAX_ATTEMPTS; $attempt++) {
            try {
                $raw = file_get_contents($url);
                if ($raw === false) {
                    throw new Exception("unable to fetch page: " . $url);
                }
                $converted = iconv('gb2312', 'UTF-8//IGNORE', $raw);
                if ($converted === false) {
                    throw new Exception("unable to convert page to UTF-8: " . $url);
                }
                $html = $converted;
                break;
            } catch (Exception $e) {
                echo $e->getMessage() . "\n" . $e->getTraceAsString() . "\n";
                $sleepTime = 10000 * random_int(100, 1000);
                echo "sleep {$sleepTime} microseconds \n";
                usleep($sleepTime);
            }
        }

        if ($html === "") {
            Log::error("page content is empty " . $url);
        }

        return $this->queryInstance->setHtml($html);
    }

    /**
     * Returns the part of an album URL between "XiuRen/" and the last dot.
     */
    private function extractAlbumCode(string $url): string
    {
        $start = (int) strpos($url, "XiuRen/") + 7;
        $end = (int) strrpos($url, ".");

        return substr($url, $start, $end - $start);
    }

    /**
     * Derives the album directory name from the info text of the album page.
     *
     * The name is cut out of $item by byte offsets. When the result is
     * implausible (12 characters or fewer, 50 or more, or without "No"), the
     * text of ".ina p b:nth-child(2)" on the loaded page is used instead.
     *
     * @param mixed $pageContent Query object currently holding the album's first page.
     */
    private function resolveAlbumName(string $item, $pageContent): string
    {
        // NOTE(review): the offsets 6 and -5 presumably skip a fixed label in
        // front of the name - confirm against a live page.
        $albumName = ltrim(substr($item, 6, (int) strrpos($item, "]") - 5));

        $length = mb_strlen($albumName);
        if ($length > 12 && $length < 50 && str_contains($albumName, "No")) {
            return $albumName;
        }

        dump("old Album: " . $albumName);
        $fallback = $pageContent->find(".ina p b:nth-child(2)")->texts();
        $fallbackName = $fallback[0] ?? null;
        if ($fallbackName === null) {
            // Nothing better available; keep the derived name rather than fail.
            Log::error("no fallback album name found, keeping: " . $albumName);
            return $albumName;
        }

        $albumName = urldecode($fallbackName);
        dump("new Album: " . $albumName);

        return $albumName;
    }

    /**
     * Makes sure $dir exists, creating it (non-recursively) when missing.
     *
     * Creation is deliberately not recursive, so a missing parent directory
     * is reported as a failure instead of being created.
     *
     * @return bool True when the directory exists afterwards.
     */
    private function ensureDirectory(string $dir): bool
    {
        if (is_dir($dir)) {
            return true;
        }

        try {
            // The second is_dir() covers the directory appearing concurrently.
            if (mkdir($dir) || is_dir($dir)) {
                return true;
            }
            Log::error("unable to create directory: " . $dir);
        } catch (Exception $e) {
            // Reached when the runtime converts the mkdir() warning to an exception.
            Log::error($e->getMessage() . "\n" . $e->getTraceAsString());
        }

        return false;
    }

    /**
     * Downloads one image, retrying up to MAX_ATTEMPTS times with a random
     * 1-10 second pause after each failed attempt.
     *
     * @return string The response body, or "" when every attempt failed.
     */
    private function downloadImage(string $imageUrl): string
    {
        for ($attempt = 0; $attempt < self::MAX_ATTEMPTS; $attempt++) {
            $handle = curl_init();
            try {
                curl_setopt_array($handle, [
                    CURLOPT_URL => $imageUrl,
                    // NOTE(review): CURLOPT_CONNECTTIMEOUT is in seconds, so this
                    // is 2000 s; milliseconds may have been intended - confirm.
                    CURLOPT_CONNECTTIMEOUT => 2000,
                    CURLOPT_RETURNTRANSFER => 1,
                    CURLOPT_USERAGENT => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
                    CURLOPT_REFERER => $this->xiurenRootUrl,
                    CURLOPT_FOLLOWLOCATION => true,
                ]);
                $content = curl_exec($handle);
                if ($content === false) {
                    throw new Exception("get image has error: " . curl_error($handle));
                }

                return $content;
            } catch (Exception $e) {
                echo $e->getMessage() . "\n" . $e->getTraceAsString() . "\n";
                $sleepTime = 10000 * random_int(100, 1000);
                echo "wait for $imageUrl sleep {$sleepTime} microseconds \n";
                usleep($sleepTime);
            } finally {
                curl_close($handle);
            }
        }

        return "";
    }
}