From 0ab8ee0e47c0d451e81b484d858f28967de19f76 Mon Sep 17 00:00:00 2001 From: nicksxs Date: Sat, 27 Feb 2021 20:40:54 +0800 Subject: [PATCH] add scrape comic album --- .idea/thriftCompiler.xml | 6 --- app/Console/Commands/ComicsScrape.php | 2 +- app/Services/ComicsService.php | 53 ++++++++++++++++++++++----- 3 files changed, 45 insertions(+), 16 deletions(-) delete mode 100644 .idea/thriftCompiler.xml diff --git a/.idea/thriftCompiler.xml b/.idea/thriftCompiler.xml deleted file mode 100644 index 7bc123c..0000000 --- a/.idea/thriftCompiler.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/app/Console/Commands/ComicsScrape.php b/app/Console/Commands/ComicsScrape.php index 8713d73..f0673a4 100644 --- a/app/Console/Commands/ComicsScrape.php +++ b/app/Console/Commands/ComicsScrape.php @@ -39,7 +39,7 @@ class ComicsScrape extends Command public function handle() { $comic = new ComicsService(); - $comic->scrapeAllAlbum("https://www.003004.com/papa/page/1"); + $comic->scrapeAllAlbum("https://www.003004.com/papa"); // } } diff --git a/app/Services/ComicsService.php b/app/Services/ComicsService.php index 5d0d078..16ed259 100644 --- a/app/Services/ComicsService.php +++ b/app/Services/ComicsService.php @@ -3,11 +3,14 @@ namespace App\Services; use App\Utils\CommonUtils; +use Illuminate\Support\Arr; use QL\QueryList; -class ComicsService { +class ComicsService +{ private $queryInstance; + public function __construct() { $this->queryInstance = QueryList::getInstance(); @@ -25,18 +28,50 @@ class ComicsService { { $content = $this->getQueryInstance()->get($rootUrl); $rules = [ - // 采集文章标题 - 'title' => ['.thumbnail','title'], - // 采集链接 - 'link' => ['.thumbnail','href'], - // 采集缩略图 - 'totalNum' => ['body > section > div.content-wrap > div > div.pagination.pagination-multi > ul > li:nth-child(8) > span','text'], -]; + // 采集文章标题 + 'title' => ['.thumbnail', 'title'], + // 采集链接 + 'link' => ['.thumbnail', 'href'], + // 采集缩略图 + 'totalNum' => ['body > section > div.content-wrap > div > div.pagination.pagination-multi > ul > li:nth-child(8) > span', 'text'], + ]; // $links = $content->find("a.thumbnail")->attrs("href"); $range = ".pb article"; - $list = $content->rules($rules)->range($range)->query()->getData();; + $list = $content->rules($rules)->range($range)->query()->getData(); dump($list->all()); + $parsedList = array_merge([], $list->all()); + $totalPageNumMatchedItems = $content->find("body > section > div.content-wrap > div > div.pagination.pagination-multi > ul > li:nth-child(8) > span")->texts(); + $totalPageNumStr = Arr::get($totalPageNumMatchedItems->all(), 0); + preg_match("#\d+#", $totalPageNumStr, $totalPageNumPregMatchedItems); + $totalPageNum = $totalPageNumPregMatchedItems[0]; // $totalPageNum = body > section > div.content-wrap > div > div.pagination.pagination-multi > ul > li:nth-child(8) > span // $links->dump(); + + for ($i = 2; $i < $totalPageNum; $i++) { + break; + $pageUrl = $rootUrl . "/page/" . $i; + $content = $this->getQueryInstance()->get($pageUrl); + $parsedList[] = $content->rules($rules)->range($range)->query()->getData()->all(); + CommonUtils::randomSleep(); + break; + } + dump($parsedList); + foreach ($parsedList as $item) { + $this->processAlbum($item); + exit; + } + } + + public function processAlbum($item) + { + $content = $this->getQueryInstance()->get($item["link"]); + $rules = [ + "link" => [".post-page-numbers", "href"] + ]; + $range = ".article-content > div:nth-child(1) a"; +// $list = $content->rules($rules)->range($range)->query()->getData(); + $list = $content->find(".article-content > div:nth-child(1) a")->attrs("href"); + dump($list->all()); + } }