<?php
|
|
|
|
|
|
namespace App\Services;
|
|
|
|
|
|
use Exception;
|
|
use ErrorException;
|
|
use Log;
|
|
use QL\QueryList;
|
|
|
|
/**
 * Scraper that walks the album listing of the configured site, opens every
 * album, and stores the album images in one local directory per album.
 *
 * Pages are downloaded with file_get_contents(), converted from GB2312 to
 * UTF-8 and parsed through a QueryList instance. Failed downloads are retried
 * with a randomized back-off.
 */
class XiurenjiService
{
    /** Maximum number of download attempts before a URL is given up on. */
    private const MAX_ATTEMPTS = 100;

    /** @var string Site origin, prepended to relative album links found on listing pages. */
    public $domainUrl = "https://www.xiurenji.com";

    /** @var string Base URL from which listing pages and album sub-pages are built. */
    public $xiurenRootUrl = "https://www.xiurenji.com/XiuRen/";

    /** @var string Local directory (with trailing slash) under which album folders are created. */
    public $rootDir = "/Users/shixuesen/Documents/tmp/xiuren/";

    /** @var QueryList Parser instance; every call to getEncodeHtmlContent() loads new HTML into it. */
    public $queryInstance;

    public function __construct()
    {
        $this->queryInstance = QueryList::getInstance();
    }

    /**
     * Scrape every listing page and, through it, every album.
     *
     * The HTML of the first ".page span" element on the first listing page is
     * interpreted as the total number of albums; it is divided by the fixed
     * listing page size (20) to obtain the number of listing pages.
     *
     * @return void
     */
    public function scrapeAlbum()
    {
        $pageSize = 20;

        $rawTotal = $this->getEncodeHtmlContent($this->xiurenRootUrl . "index.html")
            ->find(".page span")
            ->htmls()
            ->get(0);
        print_r($rawTotal);

        // Cast once and use the integer everywhere: doing arithmetic on the raw
        // HTML string raises a warning/TypeError when it is not purely numeric.
        $albumTotal = (int) $rawTotal;
        if ($albumTotal <= 0) {
            Log::error("album total not found on listing page " . $this->xiurenRootUrl . "index.html");

            return;
        }

        $listingPages = (int) ceil($albumTotal / $pageSize);
        for ($i = 0; $i < $listingPages; $i++) {
            // The first listing page has no numeric suffix.
            $urlSuffix = $i === 0 ? "index.html" : "index" . $i . ".html";
            $this->scrapePageAlbum($this->xiurenRootUrl . $urlSuffix);
        }
    }

    /**
     * Scrape every album linked from one listing page.
     *
     * @param string $url Absolute URL of the listing page.
     *
     * @return void
     */
    public function scrapePageAlbum($url)
    {
        $pageContent = $this->getEncodeHtmlContent($url);

        // Copy the hrefs out before scraping any album: the nested calls load
        // other pages into the same parser instance.
        $hrefs = [];
        foreach ($pageContent->find(".dan a")->getElements() as $link) {
            $hrefs[] = (string) $link->getAttribute("href");
        }

        foreach ($hrefs as $href) {
            if ($href === '') {
                continue;
            }

            dump($href);
            $this->scrapeSingleAlbum($this->toAbsoluteUrl($href));
        }
    }

    /**
     * Download all images of one album.
     *
     * The album title is read from ".ina p:nth-child(2)" and the HTML of the
     * second-to-last pager link (".page a:eq(-2)") is cast to int and used as
     * the number of album pages. The first page is the album URL itself; the
     * following pages are "<albumCode>_<n>.html" below $xiurenRootUrl.
     *
     * @param string $url Absolute URL of the album's first page.
     *
     * @return void
     */
    public function scrapeSingleAlbum($url)
    {
        $pageContent = $this->getEncodeHtmlContent($url);

        $items = $pageContent->find(".ina p:nth-child(2)")->texts();
        $pageItems = $pageContent->find(".page a:eq(-2)")->htmls();
        if (count($pageItems) <= 0) {
            dump("this album is error: ". $url);
            Log::error("this album is error: " . $url);

            return;
        }

        $albumCode = $this->extractAlbumCode((string) $url);
        if ($albumCode === null) {
            Log::error("cannot derive album code from url: " . $url);

            return;
        }

        $pageCount = (int) $pageItems[0];

        // getEncodeHtmlContent() hands back $queryInstance after loading new
        // HTML into it, so $pageContent most likely reflects whichever page was
        // loaded last. Track whether that is still the album's first page.
        $firstPageLoaded = true;

        foreach ($items as $item) {
            $albumName = $this->extractAlbumName((string) $item);
            if ($albumName === '') {
                // An empty name would make the images land directly in $rootDir.
                Log::error("cannot derive album name for url: " . $url);
                continue;
            }

            $albumDir = $this->rootDir . $albumName;

            if (!$firstPageLoaded) {
                $pageContent = $this->getEncodeHtmlContent($url);
                $firstPageLoaded = true;
            }

            $this->parseContent($albumDir, $pageContent);
            dump("albumName: ". $albumName);

            for ($i = 1; $i < $pageCount; $i++) {
                $pageContent = $this->getEncodeHtmlContent($this->xiurenRootUrl . $albumCode . "_" . $i . ".html");
                $firstPageLoaded = false;
                $this->parseContent($albumDir, $pageContent);
            }
        }
    }

    /**
     * Download every image found on an already loaded album page into $dir.
     *
     * Images that already exist in $dir are skipped. Failures are logged and
     * do not stop the remaining images from being processed.
     *
     * @param string    $dir         Target directory; created (recursively) when missing.
     * @param QueryList $pageContent Parser holding the album page HTML.
     *
     * @return void
     */
    public function parseContent($dir, $pageContent)
    {
        // The second is_dir() covers the case where the directory appeared
        // between the first check and mkdir().
        if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
            Log::error("cannot create album directory " . $dir);

            return;
        }

        $images = $pageContent->find(".img p img")->getElements();

        foreach ($images as $image) {
            $imageUrl = (string) $image->getAttribute("src");
            if ($imageUrl === '') {
                continue;
            }

            // The first two bytes of the src are replaced by the image host prefix.
            $trueImageUrl = "https://x1.plmn5.com/U". substr($imageUrl, 2);
            $fileInfo = pathinfo($trueImageUrl);
            $targetPath = $dir . "/" . $fileInfo["basename"];

            if (file_exists($targetPath)) {
                continue;
            }

            dump($fileInfo);

            // Throttle: 1-10 ms pause before each download (usleep takes microseconds).
            usleep(random_int(10, 100) * 100);

            $content = $this->downloadWithRetry($trueImageUrl);
            if ($content === null || $content === '') {
                Log::error("image content is empty ". $trueImageUrl);
                continue;
            }

            if (file_put_contents($targetPath, $content) === false) {
                Log::error("cannot write image " . $targetPath);
            }
        }
    }

    /**
     * Download a page, convert it from GB2312 to UTF-8 and load it into the parser.
     *
     * When the download or the conversion fails, an empty document is loaded,
     * so selectors on the returned object simply match nothing.
     *
     * @param string $url Absolute URL of the page.
     *
     * @return QueryList The value returned by $queryInstance->setHtml().
     */
    public function getEncodeHtmlContent($url)
    {
        $raw = $this->downloadWithRetry((string) $url);

        if ($raw === null) {
            Log::error("page download failed " . $url);
            $html = "";
        } else {
            $html = $this->convertToUtf8($raw, (string) $url);
        }

        return $this->queryInstance->setHtml($html);
    }

    /**
     * Fetch a URL, retrying up to MAX_ATTEMPTS times with a random 1-10 s pause.
     *
     * Both failure modes of file_get_contents() are treated as a failed
     * attempt: a `false` return value and a thrown exception. Exception also
     * covers ErrorException, which is what a failed request surfaces as when
     * an error handler converts PHP warnings into exceptions.
     *
     * @return string|null Response body, or null when every attempt failed.
     */
    private function downloadWithRetry(string $url): ?string
    {
        for ($attempt = 1; $attempt <= self::MAX_ATTEMPTS; $attempt++) {
            try {
                $body = file_get_contents($url);
                if ($body !== false) {
                    return $body;
                }
                echo "download failed for {$url}\n";
            } catch (Exception $e) {
                echo $e->getMessage() . "\n" . $e->getTraceAsString() . "\n";
            }

            if ($attempt === self::MAX_ATTEMPTS) {
                break;
            }

            // usleep() takes microseconds: 10000 * [100, 1000] is 1 to 10 seconds.
            $sleepTime = 10000 * random_int(100, 1000);
            echo "wait for {$url} sleep {$sleepTime} microseconds\n";
            usleep($sleepTime);
        }

        return null;
    }

    /**
     * Convert GB2312 bytes to UTF-8.
     *
     * iconv() is tried first. When it fails and mbstring is available, CP936
     * (a superset of GB2312) is used as a fallback. A conversion failure is
     * not retried by downloading again, because the same bytes would fail the
     * same way.
     *
     * @return string UTF-8 text, or an empty string when conversion is impossible.
     */
    private function convertToUtf8(string $raw, string $url): string
    {
        if ($raw === '') {
            return '';
        }

        try {
            $converted = iconv('gb2312', 'UTF-8//IGNORE', $raw);
        } catch (Exception $e) {
            $converted = false;
        }

        if ($converted === false && function_exists('mb_convert_encoding')) {
            try {
                $converted = mb_convert_encoding($raw, 'UTF-8', 'CP936');
            } catch (Exception $e) {
                $converted = false;
            }
        }

        if ($converted === false) {
            Log::error("cannot convert page to UTF-8 " . $url);

            return '';
        }

        return $converted;
    }

    /**
     * Extract the part of an album URL between "XiuRen/" and the last dot.
     *
     * @return string|null The album code, or null when the URL does not have that shape.
     */
    private function extractAlbumCode(string $url): ?string
    {
        $marker = "XiuRen/";
        $markerPos = strpos($url, $marker);
        $dotPos = strrpos($url, ".");
        if ($markerPos === false || $dotPos === false) {
            return null;
        }

        $start = $markerPos + strlen($marker);
        if ($dotPos <= $start) {
            return null;
        }

        return substr($url, $start, $dotPos - $start);
    }

    /**
     * Derive a directory-safe album name from the title text of an album page.
     *
     * The first 6 bytes of the text are skipped and the name runs up to and
     * including the last "]".
     * NOTE(review): the 6-byte prefix is presumably a fixed label of two
     * 3-byte UTF-8 characters; verify against the site markup.
     *
     * @return string The album name, or an empty string when none can be derived.
     */
    private function extractAlbumName(string $text): string
    {
        $prefixLength = 6;
        $closingPos = strrpos($text, "]");

        // Without a "]" after the prefix there is nothing to cut at; take the rest.
        $name = ($closingPos === false || $closingPos < $prefixLength)
            ? substr($text, $prefixLength)
            : substr($text, $prefixLength, $closingPos - $prefixLength + 1);

        // Path separators in a title would turn the name into a nested path.
        $name = str_replace(['/', '\\', "\0"], '_', trim((string) $name));

        return ($name === '.' || $name === '..') ? '' : $name;
    }

    /**
     * Turn an href from a listing page into an absolute URL.
     */
    private function toAbsoluteUrl(string $href): string
    {
        if (preg_match('#^https?://#i', $href) === 1) {
            return $href;
        }

        return $this->domainUrl . $href;
    }
}
|