Browse Source

add scrape comic album

feature/new_bilibili_and_instagram_sxs20191126
nicksxs 4 years ago
parent
commit
0ab8ee0e47
3 changed files with 45 additions and 16 deletions
  1. +0
    -6
      .idea/thriftCompiler.xml
  2. +1
    -1
      app/Console/Commands/ComicsScrape.php
  3. +44
    -9
      app/Services/ComicsService.php

+ 0
- 6
.idea/thriftCompiler.xml View File

@ -1,6 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ThriftCompiler">
<compilers />
</component>
</project>

+ 1
- 1
app/Console/Commands/ComicsScrape.php View File

@ -39,7 +39,7 @@ class ComicsScrape extends Command
/**
 * Console command entry point: creates a ComicsService and asks it to
 * scrape the album listing starting from the given root URL.
 */
public function handle()
{
$comic = new ComicsService();
// NOTE(review): this view is a diff rendered without +/- markers; the two
// calls below look like the before/after of a single changed line — confirm
// which one is current. scrapeAllAlbum() appends "/page/<n>" to the URL it is
// given, so the variant without "/page/1" is presumably the intended one.
$comic->scrapeAllAlbum("https://www.003004.com/papa/page/1");
$comic->scrapeAllAlbum("https://www.003004.com/papa");
//
}
}

+ 44
- 9
app/Services/ComicsService.php View File

@ -3,11 +3,14 @@
namespace App\Services;
use App\Utils\CommonUtils;
use Illuminate\Support\Arr;
use QL\QueryList;
class ComicsService {
class ComicsService
{
private $queryInstance;
public function __construct()
{
$this->queryInstance = QueryList::getInstance();
@ -25,18 +28,50 @@ class ComicsService {
{
$content = $this->getQueryInstance()->get($rootUrl);
$rules = [
// 采集文章标题
'title' => ['.thumbnail','title'],
// 采集链接
'link' => ['.thumbnail','href'],
// 采集缩略图
'totalNum' => ['body > section > div.content-wrap > div > div.pagination.pagination-multi > ul > li:nth-child(8) > span','text'],
];
// Scrape the article title
'title' => ['.thumbnail', 'title'],
// Scrape the article link
'link' => ['.thumbnail', 'href'],
// Scrape the total-page-count text from the pagination bar
'totalNum' => ['body > section > div.content-wrap > div > div.pagination.pagination-multi > ul > li:nth-child(8) > span', 'text'],
];
// $links = $content->find("a.thumbnail")->attrs("href");
$range = ".pb article";
$list = $content->rules($rules)->range($range)->query()->getData();;
$list = $content->rules($rules)->range($range)->query()->getData();
dump($list->all());
$parsedList = array_merge([], $list->all());
$totalPageNumMatchedItems = $content->find("body > section > div.content-wrap > div > div.pagination.pagination-multi > ul > li:nth-child(8) > span")->texts();
$totalPageNumStr = Arr::get($totalPageNumMatchedItems->all(), 0);
preg_match("#\d+#", $totalPageNumStr, $totalPageNumPregMatchedItems);
$totalPageNum = $totalPageNumPregMatchedItems[0];
// $totalPageNum = body > section > div.content-wrap > div > div.pagination.pagination-multi > ul > li:nth-child(8) > span
// $links->dump();
for ($i = 2; $i < $totalPageNum; $i++) {
break;
$pageUrl = $rootUrl . "/page/" . $i;
$content = $this->getQueryInstance()->get($pageUrl);
$parsedList[] = $content->rules($rules)->range($range)->query()->getData()->all();
CommonUtils::randomSleep();
break;
}
dump($parsedList);
foreach ($parsedList as $item) {
$this->processAlbum($item);
exit;
}
}
/**
 * Fetch one album page and dump the href of every anchor found inside the
 * first child <div> of the article content.
 *
 * NOTE(review): those anchors are presumably the album's per-page links —
 * confirm against the target site's markup.
 *
 * @param array $item Scraped album entry; its "link" key is used as the album URL.
 * @return void
 */
public function processAlbum($item)
{
    // Entries without a usable link (e.g. a nested page result that has no
    // top-level "link" key) are skipped instead of requesting an empty URL.
    $albumUrl = Arr::get($item, "link");
    if (empty($albumUrl)) {
        return;
    }

    $content = $this->getQueryInstance()->get($albumUrl);

    // Single source of truth for the selector (it was previously duplicated
    // as a literal next to an unused $range / $rules pair).
    $range = ".article-content > div:nth-child(1) a";
    $list = $content->find($range)->attrs("href");

    dump($list->all());
}
}

Loading…
Cancel
Save