Browse Source

modify instagram and xiuren and so on

feature/new_bilibili_and_instagram_sxs20191126
shixuesen 3 years ago
parent
commit
b8387bf90c
22 changed files with 1048 additions and 210 deletions
  1. +53
    -0
      .env.prod.back
  2. +5
    -2
      app/Console/Commands/AcfunScrape.php
  3. +10
    -9
      app/Console/Commands/BiliVideoCode.php
  4. +1
    -0
      app/Console/Commands/InstagramScrape.php
  5. +34
    -16
      app/Console/Commands/RecordDir.php
  6. +21
    -2
      app/Console/Commands/RenameTest.php
  7. +21
    -2
      app/Console/Commands/XiurenjiScrape.php
  8. +21
    -20
      app/Console/Kernel.php
  9. +12
    -6
      app/Http/Controllers/WeiboController.php
  10. +1
    -0
      app/ImageRecord.php
  11. +5
    -5
      app/Services/AcfunService.php
  12. +11
    -6
      app/Services/BilibiliServiceV2.php
  13. +10
    -5
      app/Services/FfmpegService.php
  14. +94
    -3
      app/Services/InstagramService.php
  15. +2
    -0
      app/Services/NewNvshenService.php
  16. +22
    -9
      app/Services/RenameService.php
  17. +44
    -11
      app/Services/WeiboService.php
  18. +256
    -0
      app/Services/XiaoyuService.php
  19. +146
    -114
      app/Services/XiurenjiService.php
  20. +256
    -0
      app/Services/YouwuService.php
  21. +20
    -0
      app/Utils/helper.php
  22. +3
    -0
      fail.log

+ 53
- 0
.env.prod.back View File

@ -0,0 +1,53 @@
APP_NAME=Laravel
APP_ENV=prod
APP_KEY=base64:BNE3RoAp39H4EB1jWx0c1xJI3arhkasdMAz52FsCc1U=
# Keep debug mode off for the prod environment: with APP_DEBUG=true Laravel
# renders full stack traces and configuration values on error pages.
APP_DEBUG=false
APP_URL=http://localhost
LOG_CHANNEL=stack
DB_CONNECTION=mysql
#DB_HOST=127.0.0.1
#DB_PORT=3306
#DB_DATABASE=ins
#DB_USERNAME=root
#DB_PASSWORD=Mff@0987654321
DB_HOST=111.231.219.223
DB_PORT=3306
DB_DATABASE=ins
DB_USERNAME=ns
DB_PASSWORD=949sxs949
BROADCAST_DRIVER=log
CACHE_DRIVER=redis
QUEUE_CONNECTION=sync
SESSION_DRIVER=file
SESSION_LIFETIME=120
REDIS_HOST=127.0.0.1
REDIS_PASSWORD=null
REDIS_PORT=6379
MAIL_DRIVER=smtp
MAIL_HOST=smtp.mailtrap.io
MAIL_PORT=2525
MAIL_USERNAME=null
MAIL_PASSWORD=null
MAIL_ENCRYPTION=null
PUSHER_APP_ID=
PUSHER_APP_KEY=
PUSHER_APP_SECRET=
PUSHER_APP_CLUSTER=mt1
MIX_PUSHER_APP_KEY="${PUSHER_APP_KEY}"
MIX_PUSHER_APP_CLUSTER="${PUSHER_APP_CLUSTER}"
OAUTH_APP_ID=89b50ce9-fcd4-4d6b-a830-bbfe0fa5703b
OAUTH_APP_PASSWORD=gmwsDOF38_+ycvIDAH885[)
OAUTH_REDIRECT_URI=http://localhost:8000/callback
#OAUTH_SCOPES='openid profile offline_access user.read calendars.read Files.Read.All, Files.ReadWrite.All, Sites.Read.All, Sites.ReadWrite.All'
OAUTH_SCOPES='openid profile offline_access user.read calendars.read'
OAUTH_AUTHORITY=https://login.microsoftonline.com/common
OAUTH_AUTHORIZE_ENDPOINT=/oauth2/v2.0/authorize
OAUTH_TOKEN_ENDPOINT=/oauth2/v2.0/token

+ 5
- 2
app/Console/Commands/AcfunScrape.php View File

@ -38,9 +38,12 @@ class AcfunScrape extends Command
*/ */
public function handle() public function handle()
{ {
// 几兔灰 10703951
// 香菜猫饼 4537972
// 小清晨儿 2277346
$service = new AcfunService(); $service = new AcfunService();
// $service->requestUpPageApi(4537972);exit;
// $service->queryUpUsersVideos(4537972);exit;
// $service->requestUpPageApi(10703951);exit;
// $service->queryUpUsersVideos(2277346);exit;
$service->downloadVideo(); $service->downloadVideo();
// //
} }


+ 10
- 9
app/Console/Commands/BiliVideoCode.php View File

@ -46,19 +46,20 @@ class BiliVideoCode extends Command
// dump($this->arguments());exit; // dump($this->arguments());exit;
// //
$bilibili = new BilibiliServiceV2(new BilibiliVideoRepository(App::getFacadeApplication())); $bilibili = new BilibiliServiceV2(new BilibiliVideoRepository(App::getFacadeApplication()));
// $bilibili->checkVideoHasDownload();
// $bilibili->checkVideoHasDownload();
// $bilibili->queryLocalUpVideoList();exit;
// $bilibili->insertDBTest();exit;
// $bilibili->queryPlayList();
// $bilibili->queryLocalUpVideoList();exit;
// $bilibili->insertDBTest();exit;
// $bilibili->queryPlayList();
$bilibili->queryUpVideoList(10278125);
// exit;
// $bilibili->queryUpVideoList(10278125);
// exit;
$bilibili->queryDBCollectionList();
// $bilibili->compareAndDownloadUpVideos(true);
// $bilibili->queryDBCollectionList();
// $bilibili->queryForVideoParts();
$bilibili->compareAndDownloadUpVideos(true);
// $bilibili->compareAndDownloadCollectionVideos(); // $bilibili->compareAndDownloadCollectionVideos();
// exit;
exit;
$bilibili->queryForVideoParts(); $bilibili->queryForVideoParts();
if (App::environment() == "local") { if (App::environment() == "local") {
# code... # code...


+ 1
- 0
app/Console/Commands/InstagramScrape.php View File

@ -51,6 +51,7 @@ class InstagramScrape extends Command
// print_r($userList);exit; // print_r($userList);exit;
$ins = new InstagramService(); $ins = new InstagramService();
// $ins->getUserNameById('4156629214');exit; // $ins->getUserNameById('4156629214');exit;
$ins->scrapeFeeds();
$ins->scrapeUsers($start); $ins->scrapeUsers($start);
$ins->scrapeLikedUsers(); $ins->scrapeLikedUsers();
$ins->scrapeCollection();exit; $ins->scrapeCollection();exit;


+ 34
- 16
app/Console/Commands/RecordDir.php View File

@ -41,33 +41,51 @@ class RecordDir extends Command
*/ */
public function handle() public function handle()
{ {
//
$path = trim($this->argument("path")); $path = trim($this->argument("path"));
$service = new DirService(); $service = new DirService();
$list = $service->recursiveScan($path); $list = $service->recursiveScan($path);
// $i = 0;
foreach ($list["files"] as $file) {
if (strstr($file, ".DS_Store")) {
if (isset($list["files"]) && count($list["files"]) > 0) {
foreach ($list["files"] as $file) {
if (strstr($file, ".DS_Store")) {
continue;
}
$fileInfo = pathinfo($file);
try {
$innerPath = str_replace($path, "", $fileInfo["dirname"]);
$innerName = $fileInfo["basename"];
ImageRecord::firstOrCreate(["path" => $innerPath, "name" => $innerName],
["path" => $innerPath,
"name" => $innerName,
"type" => 2
]
);
} catch (QueryException $e) {
if (!str_contains($e->getMessage(), "Duplicate entry")) {
Log::error($e->getMessage());
}
}
unset($imageRecord);
}
}
foreach ($list["dirs"] as $dir) {
if (strstr($dir, ".DS_Store") || $dir == ".." || $dir == "." || str_starts_with($dir, ".")) {
continue; continue;
} }
$fileInfo = pathinfo($file);
try { try {
$imageRecord = new ImageRecord();
$imageRecord->path = str_replace($path, "", $fileInfo["dirname"]);
$imageRecord->name = $fileInfo["basename"];
$imageRecord->type = 2;// dump($imageRecord->getAttributes());
// $i++;
// if ($i > 100) {
// exit;
// }
// continue;
$imageRecord->save();
ImageRecord::firstOrCreate(["path" => $path, "name" => $dir],
["path" => $path,
"name" => $dir,
"type" => 1
]
);
} catch (QueryException $e) { } catch (QueryException $e) {
if (!str_contains($e->getMessage(), "Duplicate entry")) { if (!str_contains($e->getMessage(), "Duplicate entry")) {
Log::error($e->getMessage()); Log::error($e->getMessage());
} }
} }
unset($imageRecord);
} }
} }


+ 21
- 2
app/Console/Commands/RenameTest.php View File

@ -4,6 +4,7 @@ namespace App\Console\Commands;
use App\Services\RenameService; use App\Services\RenameService;
use Illuminate\Console\Command; use Illuminate\Console\Command;
use function RingCentral\Psr7\str;
class RenameTest extends Command class RenameTest extends Command
{ {
@ -42,9 +43,27 @@ class RenameTest extends Command
$path = $this->argument("path"); $path = $this->argument("path");
$prefix = $this->argument("prefix"); $prefix = $this->argument("prefix");
$rename = new RenameService(); $rename = new RenameService();
dump($rename->splitCustomSizeOfFolder($path, $prefix, 500));exit;
// $files = scandir($path);
// foreach ($files as $file) {
// if (str_contains($file, "KID=imgbed,photo&")) {
// $pos = strpos($file, ".jpg");
// $newFileName = substr($file, 0, $pos + 4);
// dump("new filename ". $newFileName);
//
// rename($path . DIRECTORY_SEPARATOR . $file, $path . DIRECTORY_SEPARATOR . $newFileName);
// }
// }
// exit;
// $rename->rename($path, $prefix);
// $arr[0] = 1640486381;
// $arr[1] = 1640553754;
// $arr[2] = 1640488544;
// asort($arr);
// dump($arr);exit;
// $rename->rename($path, $prefix);exit;
$rename->splitCustomSizeOfFolder($path, $prefix, 500);exit;
// $rename->rename("/Volumes/WD/tmp/写真图/猫九", "猫九-"); // $rename->rename("/Volumes/WD/tmp/写真图/猫九", "猫九-");
$rename->rename($path, $prefix);
return; return;
$rename->rename("/Volumes/Backup/images/写真/pcBack/3/", ""); $rename->rename("/Volumes/Backup/images/写真/pcBack/3/", "");
// $rename->rename(); // $rename->rename();


+ 21
- 2
app/Console/Commands/XiurenjiScrape.php View File

@ -2,7 +2,9 @@
namespace App\Console\Commands; namespace App\Console\Commands;
use App\Services\XiaoyuService;
use App\Services\XiurenjiService; use App\Services\XiurenjiService;
use App\Services\YouwuService;
use Illuminate\Console\Command; use Illuminate\Console\Command;
class XiurenjiScrape extends Command class XiurenjiScrape extends Command
@ -12,7 +14,7 @@ class XiurenjiScrape extends Command
* *
* @var string * @var string
*/ */
protected $signature = 'xiuren:s';
protected $signature = 'xiuren:s {all} {site} {num} {start}';
/** /**
* The console command description. * The console command description.
@ -39,8 +41,25 @@ class XiurenjiScrape extends Command
public function handle() public function handle()
{ {
// //
$isAll = $this->argument('all');
$site = $this->argument("site");
$num = $this->argument("num");
$start = $this->argument("start");
// if ($site == "xiuren") {
$service = new XiurenjiService(); $service = new XiurenjiService();
// } else if ($site == "xiaoyu") {
// $service = new XiaoyuService();
// } else if ($site == "youwu"){
// $service = new YouwuService();
// }
if ($isAll == "1") {
$service->scrapeAll();
} else {
$service->scrapeAlbum($site, $num, $start);
}
// $service->scrapeSingleAlbum("https://www.xiurenji.vip/XiuRen/7828.html");exit; // $service->scrapeSingleAlbum("https://www.xiurenji.vip/XiuRen/7828.html");exit;
$service->scrapeAlbum();
} }
} }

+ 21
- 20
app/Console/Kernel.php View File

@ -2,7 +2,6 @@
namespace App\Console; namespace App\Console;
use App;
use App\Services\BilibiliService; use App\Services\BilibiliService;
use App\Services\BilibiliServiceV2; use App\Services\BilibiliServiceV2;
use App\Services\CompressImageService; use App\Services\CompressImageService;
@ -13,10 +12,12 @@ use App\Services\TujiguService;
use App\Services\XiuGirlsService; use App\Services\XiuGirlsService;
use App\Services\VitabioticsService; use App\Services\VitabioticsService;
use App\Services\YouKnowService; use App\Services\YouKnowService;
use App\Repositories\BilibiliVideoRepository;
use Illuminate\Console\Scheduling\Schedule; use Illuminate\Console\Scheduling\Schedule;
use Illuminate\Foundation\Console\Kernel as ConsoleKernel; use Illuminate\Foundation\Console\Kernel as ConsoleKernel;
use App\Services\InstagramService; use App\Services\InstagramService;
use Illuminate\Support\Facades\Log; use Illuminate\Support\Facades\Log;
use Illuminate\Support\Facades\App;
date_default_timezone_set('PRC'); date_default_timezone_set('PRC');
@ -40,44 +41,44 @@ class Kernel extends ConsoleKernel
protected function schedule(Schedule $schedule) protected function schedule(Schedule $schedule)
{ {
$schedule->call(function () { $schedule->call(function () {
if ((rand(0, 99) % 2) == 0) {
exit();
}
if(date('H') % 4 != 0) {
Log::info("schedule queryUpVideoList skipped at: " .date("Y-m-d H:i:s"));
exit;
}
$bilibili = new BilibiliServiceV2();
if ((rand(0, 99) % 2) == 0) {
exit();
}
if(date('H') % 3 != 0) {
Log::info("schedule queryUpVideoList skipped at: " .date("Y-m-d H:i:s"));
exit;
}
$bilibili = new BilibiliServiceV2(new BilibiliVideoRepository(App::getFacadeApplication()));
Log::info("schedule queryUpVideoList started at: ". date("Y-m-d H:i:s")); Log::info("schedule queryUpVideoList started at: ". date("Y-m-d H:i:s"));
$bilibili->queryUpVideoList(); $bilibili->queryUpVideoList();
})->hourlyAt(5); })->hourlyAt(5);
// })->yearly(); // })->yearly();
$schedule->call(function () { $schedule->call(function () {
if ((rand(0, 99) % 2) == 1) {
exit();
}
if(date('H') % 4 != 0) {
Log::info("schedule queryDBCollectionList skipped at: " .date("Y-m-d H:i:s"));
exit;
}
$bilibili = new BilibiliServiceV2();
if ((rand(0, 99) % 2) == 1) {
exit();
}
if(date('H') % 3 != 0) {
Log::info("schedule queryDBCollectionList skipped at: " .date("Y-m-d H:i:s"));
exit;
}
$bilibili = new BilibiliServiceV2(new BilibiliVideoRepository(App::getFacadeApplication()));
Log::info("schedule queryDBCollectionList started at: ". date("Y-m-d H:i:s")); Log::info("schedule queryDBCollectionList started at: ". date("Y-m-d H:i:s"));
$bilibili->queryDBCollectionList(); $bilibili->queryDBCollectionList();
})->hourlyAt(35); })->hourlyAt(35);
// })->yearly(); // })->yearly();
$schedule->call(function () { $schedule->call(function () {
$bilibili = new BilibiliServiceV2();
$bilibili = new BilibiliServiceV2(new BilibiliVideoRepository(App::getFacadeApplication()));
Log::info("schedule queryForVideoParts started at: ". date("Y-m-d H:i:s")); Log::info("schedule queryForVideoParts started at: ". date("Y-m-d H:i:s"));
$bilibili->queryForVideoParts(); $bilibili->queryForVideoParts();
})->dailyAt("02:00"); })->dailyAt("02:00");
$schedule->call(function () { $schedule->call(function () {
$bilibili = new BilibiliServiceV2();
$bilibili = new BilibiliServiceV2(new BilibiliVideoRepository(App::getFacadeApplication()));
Log::info("schedule compareAndDownloadUpVideos started at: ". date("Y-m-d H:i:s")); Log::info("schedule compareAndDownloadUpVideos started at: ". date("Y-m-d H:i:s"));
// $bilibili->compareAndDownloadUpVideos(); // $bilibili->compareAndDownloadUpVideos();
})->dailyAt('03:00'); })->dailyAt('03:00');
$schedule->call(function () { $schedule->call(function () {
$bilibili = new BilibiliServiceV2();
$bilibili = new BilibiliServiceV2(new BilibiliVideoRepository(App::getFacadeApplication()));
Log::info("schedule compareAndDownloadCollectionVideos started at: ". date("Y-m-d H:i:s")); Log::info("schedule compareAndDownloadCollectionVideos started at: ". date("Y-m-d H:i:s"));
// $bilibili->compareAndDownloadCollectionVideos(); // $bilibili->compareAndDownloadCollectionVideos();
})->dailyAt('04:00'); })->dailyAt('04:00');


+ 12
- 6
app/Http/Controllers/WeiboController.php View File

@ -30,6 +30,12 @@ class WeiboController extends Controller
// https://m.weibo.cn/api/container/getIndex?containerid=230259&openApp=0&page= // https://m.weibo.cn/api/container/getIndex?containerid=230259&openApp=0&page=
// 物理 2304133907143723 // 物理 2304133907143723
// 230259 自己
// 轮子哥 2304131916825084
// 徐圣佑 5893812490
// 徐圣佑- 新号 1076035893812490
$url = array(); $url = array();
@ -38,11 +44,11 @@ class WeiboController extends Controller
// $url[] = 'https://m.weibo.cn/api/container/getIndex?containerid=1076035893812490&openApp=0&page='.$i; // $url[] = 'https://m.weibo.cn/api/container/getIndex?containerid=1076035893812490&openApp=0&page='.$i;
// for( $i = 100; $i >= 1 ; $i-- ) // for( $i = 100; $i >= 1 ; $i-- )
// $url[] = 'https://m.weibo.cn/api/container/getIndex?containerid=2304133907143723&openApp=0&page='.$i; // $url[] = 'https://m.weibo.cn/api/container/getIndex?containerid=2304133907143723&openApp=0&page='.$i;
// for ($i = 5; $i >= 1; $i--) {
// $url[] = 'https://m.weibo.cn/feed/group?gid=4423532052076817&&page=' . $i;
// }
for ($i = 1; $i < 10; $i++) {
$url[] = 'https://m.weibo.cn/api/container/getIndex?containerid=230259&&page=' . $i;
// for ($i = 10; $i >= 1; $i--) {
// $url[] = 'https://m.weibo.cn/feed/group?gid=4423532052076817&&page=' . $i;
// }
for ($i = 5; $i >= 1; $i--) {
$url[] = 'https://m.weibo.cn/api/container/getIndex?containerid=2304131916825084&page=' . $i;
} }
return response()->json($url); return response()->json($url);
} }
@ -69,7 +75,7 @@ class WeiboController extends Controller
// Log::info($request->input()); // Log::info($request->input());
$weibo = new WeiboService(); $weibo = new WeiboService();
$result = $weibo->scrapeWeiboPicAndVideo($request->input("content")); $result = $weibo->scrapeWeiboPicAndVideo($request->input("content"));
// $result = $weibo->scrapeGroupWeiboPicAndVideo($request->input("content"));
// $result = $weibo->scrapeGroupWeiboPicAndVideo($request->input("content"));
return response()->json($result); return response()->json($result);
} }


+ 1
- 0
app/ImageRecord.php View File

@ -27,4 +27,5 @@ use Illuminate\Database\Eloquent\Model;
class ImageRecord extends Model class ImageRecord extends Model
{ {
// //
protected $guarded = [''];
} }

+ 5
- 5
app/Services/AcfunService.php View File

@ -128,7 +128,7 @@ class AcfunService
public function downloadVideo() public function downloadVideo()
{ {
$innerDir = "/Volumes/Crucial X6/Video/acfun"; $innerDir = "/Volumes/Crucial X6/Video/acfun";
$list = AcfunVideo::all();
$list = AcfunVideo::orderBy('created_at', 'desc')->get();
$files = scandir($innerDir); $files = scandir($innerDir);
// dump($files); // dump($files);
@ -142,7 +142,7 @@ class AcfunService
} }
Log::info($item['title'] . " (" . $item["from_up_name"] . ").mp4" . " does not exists to download"); Log::info($item['title'] . " (" . $item["from_up_name"] . ").mp4" . " does not exists to download");
// exit; // exit;
$downloadResult = shell_exec('cd "' .$innerDir .'" && annie https://www.acfun.cn/v/ac' . $item["content_id"]);
$downloadResult = shell_exec('cd "' .$innerDir .'" && you-get https://www.acfun.cn/v/ac' . $item["content_id"]);
Log::info($downloadResult); Log::info($downloadResult);
$item["is_downloaded"] = 1; $item["is_downloaded"] = 1;
$item->save(); $item->save();
@ -166,13 +166,13 @@ class AcfunService
"title" => $titles[$key], "title" => $titles[$key],
"from_type" => 2, "from_type" => 2,
"from_collection_name" => "", "from_collection_name" => "",
"from_up_name" => "香菜猫饼",
"from_up_name" => "几兔灰",
"from_up_user_id" => $upId "from_up_user_id" => $upId
]); ]);
} }
$queried += $result["pageSize"]; $queried += $result["pageSize"];
Log::info("current queried: " . $queried); Log::info("current queried: " . $queried);
if ($result["noMore"] || $queried > (int)$result["totalCount"]) {
if (array_key_exists("noMore", $result) || $queried > (int)$result["totalCount"]) {
break; break;
} }
$pCursor = $result["pcursor"]; $pCursor = $result["pcursor"];
@ -218,7 +218,7 @@ class AcfunService
} }
public function requestUpPageApi($upId, $pcursor = "") {
public function requestUpPageApi($upId, $pcursor = "") {
if ($upId == null) { if ($upId == null) {
return ""; return "";
} }


+ 11
- 6
app/Services/BilibiliServiceV2.php View File

@ -28,7 +28,10 @@ class BilibiliServiceV2
private $baseDir = "/Volumes/intel660p/video/mv/"; private $baseDir = "/Volumes/intel660p/video/mv/";
private $remoteDir = "/data/";
// private $remoteDir = "/data/";
private $remoteDir = "/Volumes/Crucial X6/Video/";
protected $repository; protected $repository;
@ -250,8 +253,9 @@ class BilibiliServiceV2
public function compareAndDownloadUpVideos($isAll = false) public function compareAndDownloadUpVideos($isAll = false)
{ {
$env = App::environment(); $env = App::environment();
$list = BilibiliUpVideos::all();
// $list = array_slice($list->all(), 15, 5);
$list = BilibiliUpVideos::orderBy("created_at", "desc")->get();
$list = array_slice($list->all(), 0, 2);
// dump($list);exit;
foreach ($list as $item) { foreach ($list as $item) {
dump("当前 up名称是: " . $item["up_name"] . "\n"); dump("当前 up名称是: " . $item["up_name"] . "\n");
if ($item["is_downloaded"] == 1) { if ($item["is_downloaded"] == 1) {
@ -656,7 +660,7 @@ class BilibiliServiceV2
public function queryForVideoParts() public function queryForVideoParts()
{ {
$i = 1; $i = 1;
$list = BilibiliVideos::orderBy('id', 'desc')->simplePaginate(200, null, 'page', $i);
$list = BilibiliVideos::orderBy('id', 'desc')->simplePaginate(2000, null, 'page', $i);
// dump($list->items()[0]->aid); // dump($list->items()[0]->aid);
while ($list->isNotEmpty()) { while ($list->isNotEmpty()) {
foreach ($list->items() as $item) { foreach ($list->items() as $item) {
@ -972,8 +976,9 @@ done && echo "ok"');
$downloadResult = shell_exec('cd "' . $innerDir . '" && url="https://www.bilibili.com/video/av' . $aid . '?p=" $downloadResult = shell_exec('cd "' . $innerDir . '" && url="https://www.bilibili.com/video/av' . $aid . '?p="
for i in $(seq 1 ' . $parts . ') for i in $(seq 1 ' . $parts . ')
do do
annie -c "SESSDATA=94247a4e%2C1651981649%2C1dba1%2Ab1;" $url$i
lux -c "SESSDATA=94247a4e%2C1651981649%2C1dba1%2Ab1;" $url$i
done && echo "ok"'); done && echo "ok"');
//
} }
Log::info($downloadResult); Log::info($downloadResult);
Log::info("$aid current download result: " . $downloadResult); Log::info("$aid current download result: " . $downloadResult);
@ -1055,7 +1060,7 @@ done && echo "ok"');
dump($list); dump($list);
} }
public function checkDiskSpace($dir = "/data")
public function checkDiskSpace($dir = "/Volumes/Crucial X6/Video/bilibili/")
{ {
if (disk_free_space($dir) > 5 * 1024 * 1024 * 1024) { if (disk_free_space($dir) > 5 * 1024 * 1024 * 1024) {
return true; return true;


+ 10
- 5
app/Services/FfmpegService.php View File

@ -119,11 +119,13 @@ class FfmpegService
Log::info("in uneed: " . $fileInfo["filename"]); Log::info("in uneed: " . $fileInfo["filename"]);
return; return;
} }
if (!$this->checkFileSize($pathFile)) {
if (Redis::sismember("sizeSmall", $fileInfo["filename"]) || !$this->checkFileSize($pathFile)) {
Redis::sadd("sizeSmall", $fileInfo["filename"]);
Log::info("filesize: " . $fileInfo["filename"]); Log::info("filesize: " . $fileInfo["filename"]);
return; return;
} }
if ($this->checkFileEncodeType($pathFile)) {
if (Redis::sismember("hasEncode", $fileInfo["filename"]) || $this->checkFileEncodeType($pathFile)) {
Redis::sadd("hasEncode", $fileInfo["filename"]);
Log::info("$pathFile has already encode by h265 return"); Log::info("$pathFile has already encode by h265 return");
return; return;
} }
@ -140,6 +142,7 @@ class FfmpegService
} }
$targetFile = $fileInfo["dirname"] . '/' .$fileInfo["filename"] . '-x265'. '.' . $fileInfo["extension"]; $targetFile = $fileInfo["dirname"] . '/' .$fileInfo["filename"] . '-x265'. '.' . $fileInfo["extension"];
if (is_file($targetFile)) { if (is_file($targetFile)) {
Log::info("$targetFile is exists");
unlink($pathFile); unlink($pathFile);
rename($targetFile, $pathFile); rename($targetFile, $pathFile);
return; return;
@ -147,8 +150,8 @@ class FfmpegService
dump("targetFile", [$targetFile]); dump("targetFile", [$targetFile]);
Log::info("process target file : $targetFile"); Log::info("process target file : $targetFile");
$result = shell_exec("ffmpeg -threads 4 -i ". escapeshellarg($pathFile) ." -preset ultrafast -c:v libx265 -vtag hvc1 " . escapeshellarg($targetFile) . " && echo 'ok'"); $result = shell_exec("ffmpeg -threads 4 -i ". escapeshellarg($pathFile) ." -preset ultrafast -c:v libx265 -vtag hvc1 " . escapeshellarg($targetFile) . " && echo 'ok'");
echo $result;
return;
// echo $result;
// return;
if (trim($result) == "ok") { if (trim($result) == "ok") {
echo "compress work done remove the file \n"; echo "compress work done remove the file \n";
Log::info("compress work done remove the file"); Log::info("compress work done remove the file");
@ -253,6 +256,7 @@ class FfmpegService
public function checkFileSize($file, $size = 1): bool public function checkFileSize($file, $size = 1): bool
{ {
if (is_file($file) && filesize($file) > 1 * 1024 * 1024) { if (is_file($file) && filesize($file) > 1 * 1024 * 1024) {
return true; return true;
} }
@ -270,7 +274,8 @@ class FfmpegService
->videos() // filters video streams ->videos() // filters video streams
->first() // returns the first video stream ->first() // returns the first video stream
->get('codec_name'); ->get('codec_name');
} catch (\Exception $e) {
} catch (\Throwable $e) {
echo "error $file \n";
Log::error("ffprobe has error just return false for test, exception: ". $e->getMessage()); Log::error("ffprobe has error just return false for test, exception: ". $e->getMessage());
return false; return false;
} }


+ 94
- 3
app/Services/InstagramService.php View File

@ -152,6 +152,11 @@ class InstagramService
if ($pos > 0) { if ($pos > 0) {
$filename = substr($filename, 0, $pos); $filename = substr($filename, 0, $pos);
} }
if (file_exists($filePrefix . $filename) && $fileNamePrefix != null) {
rename($filePrefix . $filename, $filePrefix . $fileNamePrefix . $filename);
echo "\n file exists and has rename to " . $filePrefix . $fileNamePrefix . $filename;
return 0;
}
$filename = $fileNamePrefix . $filename; $filename = $fileNamePrefix . $filename;
// if ($filename == "33020038_640464766303508_27725890796388352_n.jpg"){ // if ($filename == "33020038_640464766303508_27725890796388352_n.jpg"){
// $flag = 1; // $flag = 1;
@ -239,17 +244,18 @@ class InstagramService
$response = $ig->media->getLikedFeed(); $response = $ig->media->getLikedFeed();
foreach ($response->getItems() as $item) { foreach ($response->getItems() as $item) {
//echo json_encode($response->getItems());exit; //echo json_encode($response->getItems());exit;
$userName = $item->getUser()->getUsername() . "_";
switch ($item->getMediaType()) { switch ($item->getMediaType()) {
case Item::PHOTO: case Item::PHOTO:
$imageUrl = $item->getImageVersions2()->getCandidates()[0]->getUrl(); $imageUrl = $item->getImageVersions2()->getCandidates()[0]->getUrl();
$res = $this->downloadFile($imageUrl, 0, $baseImageDir);
$res = $this->downloadFile($imageUrl, 0, $baseImageDir, $userName);
if ($res == 0) { if ($res == 0) {
return; return;
} }
break; break;
case Item::VIDEO: case Item::VIDEO:
$videoUrl = $item->getVideoVersions()[0]->getUrl(); $videoUrl = $item->getVideoVersions()[0]->getUrl();
$res = $this->downloadFile($videoUrl, 0, $baseImageDir);
$res = $this->downloadFile($videoUrl, 0, $baseImageDir, $userName);
if ($res == 0) { if ($res == 0) {
return; return;
} }
@ -259,7 +265,7 @@ class InstagramService
// exit; // exit;
foreach ($item->getCarouselMedia() as $imageItem) { foreach ($item->getCarouselMedia() as $imageItem) {
$imageUrl = $imageItem->getImageVersions2()->getCandidates()[0]->getUrl(); $imageUrl = $imageItem->getImageVersions2()->getCandidates()[0]->getUrl();
$res = $this->downloadFile($imageUrl, 0, $baseImageDir);
$res = $this->downloadFile($imageUrl, 0, $baseImageDir, $userName);
if ($res == 0) { if ($res == 0) {
return; return;
} }
@ -287,6 +293,91 @@ class InstagramService
} }
} }
/**
 * Download media from the logged-in account's home timeline.
 *
 * Logs in with the configured credentials, walks the timeline page by page
 * and passes every photo, video and carousel image URL to downloadFile(),
 * using "<username>_" as the file name prefix. Feed entries without media
 * and entries whose product type is "ad" are skipped. The run ends when the
 * API returns no next page or once more than 200 feed items were handled.
 *
 * If login fails the error is printed and the process exits with status 1.
 * Any exception raised while paging is printed and ends the run.
 */
public function scrapeFeeds()
{
    $ig = new Instagram($this->debug, $this->truncatedDebug);
    try {
        $ig->login($this->username, $this->password);
    } catch (\Exception $e) {
        echo 'Something went wrong: ' . $e->getMessage() . "\n";
        // A failed login is an error, so report it through a non-zero status.
        exit(1);
    }

    $count = 0;
    $baseImageDir = "/Users/shixuesen/OneDrive/Pictures/instagram/Likes/";
    try {
        $maxId = null;
        do {
            $response = $ig->timeline->getTimelineFeed($maxId);
            foreach ($response->getFeedItems() as $item) {
                $media = $item->getMediaOrAd();
                if ($media == null || $media->getProductType() == "ad") {
                    continue;
                }

                $userName = $media->getUser()->getUsername() . "_";
                // The result of downloadFile() is intentionally not checked:
                // a file that already exists must not stop the feed scrape.
                switch ($media->getMediaType()) {
                    case Item::PHOTO:
                        $imageUrl = $media->getImageVersions2()->getCandidates()[0]->getUrl();
                        $this->downloadFile($imageUrl, 0, $baseImageDir, $userName);
                        break;
                    case Item::VIDEO:
                        $videoUrl = $media->getVideoVersions()[0]->getUrl();
                        $this->downloadFile($videoUrl, 0, $baseImageDir, $userName);
                        break;
                    case Item::CAROUSEL:
                        foreach ($media->getCarouselMedia() as $imageItem) {
                            $imageUrl = $imageItem->getImageVersions2()->getCandidates()[0]->getUrl();
                            $this->downloadFile($imageUrl, 0, $baseImageDir, $userName);
                        }
                        break;
                }

                $count++;
                if ($count > 200) {
                    return;
                }
            }

            // The next-page cursor becomes null once the last page was read.
            $maxId = $response->getNextMaxId();
            echo "\n new maxId: " . $maxId . "\n";
            if ($maxId == null) {
                // Nothing left to request, so there is no reason to wait.
                break;
            }

            // Pause a random 5-50 seconds between page requests so the API
            // does not throttle the account, and report the real duration.
            $delay = 5 * random_int(1, 10);
            echo "\n Sleeping for {$delay}s...\n";
            sleep($delay);
        } while ($maxId != null);
    } catch (\Exception $e) {
        echo 'Something went wrong: ' . $e->getMessage() . "\n";
    }
}
public function scrapeUsers($start = 0) public function scrapeUsers($start = 0)
{ {
$ig = new Instagram($this->debug, $this->truncatedDebug); $ig = new Instagram($this->debug, $this->truncatedDebug);


+ 2
- 0
app/Services/NewNvshenService.php View File

@ -13,6 +13,7 @@ use Illuminate\Support\Facades\Storage;
class NewNvshenService class NewNvshenService
{ {
// 夏西cici 28139
private static $name_dir = [ private static $name_dir = [
"周韦彤" => [ "周韦彤" => [
@ -191,6 +192,7 @@ class NewNvshenService
$page = $mainQl->get($baseUrl . $album, [], ['maxTry' => 5])->find(".albumInfo > span")->htmls(); $page = $mainQl->get($baseUrl . $album, [], ['maxTry' => 5])->find(".albumInfo > span")->htmls();
$title = $mainQl->get($baseUrl . $album, [], ['maxTry' => 5])->find(".albumTitle > #htilte")->htmls(); $title = $mainQl->get($baseUrl . $album, [], ['maxTry' => 5])->find(".albumTitle > #htilte")->htmls();
$imageSource = $mainQl->get($baseUrl . $album, [], ['maxTry' => 5])->find("#hgallery img:nth-child(1)")->attr("src"); $imageSource = $mainQl->get($baseUrl . $album, [], ['maxTry' => 5])->find("#hgallery img:nth-child(1)")->attr("src");
// Debug dump()/exit dropped here: it ended the process before any image of the album was processed.
$imageSourceParts = pathinfo($imageSource, PATHINFO_DIRNAME); $imageSourceParts = pathinfo($imageSource, PATHINFO_DIRNAME);
break; break;
} catch (ConnectException $e) { } catch (ConnectException $e) {


+ 22
- 9
app/Services/RenameService.php View File

@ -131,23 +131,35 @@ class RenameService
public function splitCustomSizeOfFolder($dir = "", $prefix = "", $size = 500) public function splitCustomSizeOfFolder($dir = "", $prefix = "", $size = 500)
{ {
$files = $this->recordAllFiles($dir, $prefix); $files = $this->recordAllFiles($dir, $prefix);
asort($files, SORT_NUMERIC);
$files = array_keys($files);
$allFileNum = count($files); $allFileNum = count($files);
$folderNum = ceil(count($files) / $size); $folderNum = ceil(count($files) / $size);
for ($i=0; $i < $folderNum; $i++) {
for ($i=0; $i < $folderNum; $i++) {
# code... # code...
$currentDirName = $dir . DIRECTORY_SEPARATOR . $prefix . "_00" .$i; $currentDirName = $dir . DIRECTORY_SEPARATOR . $prefix . "_00" .$i;
if (!is_dir($currentDirName)) { if (!is_dir($currentDirName)) {
mkdir($currentDirName); mkdir($currentDirName);
} }
for ($j=0 + $i * $size; $j < ($i + 1) * $size && $j < $allFileNum; $j++) {
for ($j=0 + $i * $size; $j < ($i + 1) * $size && $j < $allFileNum; $j++) {
$fileInfo = pathinfo($files[$j]); $fileInfo = pathinfo($files[$j]);
if (is_file($currentDirName . DIRECTORY_SEPARATOR . $fileInfo['basename'])) {
echo "file " . $currentDirName . DIRECTORY_SEPARATOR . $fileInfo['basename'] . " already exists\n";
echo "now rename {$files[$j]} to " . $currentDirName . DIRECTORY_SEPARATOR . $fileInfo['filename'] . "_1." . $fileInfo['extension'] . "\n";
rename($files[$j], $currentDirName . DIRECTORY_SEPARATOR . $fileInfo['filename'] . "_1." . $fileInfo['extension']);
} else{
rename($files[$j], $currentDirName . DIRECTORY_SEPARATOR . $fileInfo['basename']);
$newFileName = $currentDirName . DIRECTORY_SEPARATOR . $fileInfo['filename'];
$t = 0;
$trueNewFileName = $newFileName . "." . $fileInfo["extension"];
while (is_file($trueNewFileName)) {
echo "file " . $trueNewFileName . " already exists\n";
$trueNewFileName = $newFileName . "_" . $t++ . "." . $fileInfo["extension"];
echo "now rename {$files[$j]} to " . $trueNewFileName . "\n";
} }
rename($files[$j], $trueNewFileName);
// if (is_file($currentDirName . DIRECTORY_SEPARATOR . $fileInfo['basename'])) {
// echo "file " . $currentDirName . DIRECTORY_SEPARATOR . $fileInfo['basename'] . " already exists\n";
// echo "now rename {$files[$j]} to " . $currentDirName . DIRECTORY_SEPARATOR . $fileInfo['filename'] . "_1." . $fileInfo['extension'] . "\n";
// rename($files[$j], $currentDirName . DIRECTORY_SEPARATOR . $fileInfo['filename'] . "_1." . $fileInfo['extension']);
// } else{
// rename($files[$j], $currentDirName . DIRECTORY_SEPARATOR . $fileInfo['basename']);
// }
} }
} }
} }
@ -164,7 +176,8 @@ class RenameService
$trueFiles = array_merge($trueFiles, $this->recordAllFiles($dir . DIRECTORY_SEPARATOR . $file, $prefix)); $trueFiles = array_merge($trueFiles, $this->recordAllFiles($dir . DIRECTORY_SEPARATOR . $file, $prefix));
} }
if (is_file($dir . DIRECTORY_SEPARATOR . $file)) { if (is_file($dir . DIRECTORY_SEPARATOR . $file)) {
$trueFiles[] = $dir . DIRECTORY_SEPARATOR . $file;
$filetime = filectime($dir . DIRECTORY_SEPARATOR . $file);
$trueFiles[$dir . DIRECTORY_SEPARATOR . $file] = $filetime;
} }
} }
} }


+ 44
- 11
app/Services/WeiboService.php View File

@ -1,6 +1,7 @@
<?php <?php
namespace App\Services; namespace App\Services;
use App\ImageRecord;
use Illuminate\Support\Arr; use Illuminate\Support\Arr;
use Illuminate\Support\Facades\Log; use Illuminate\Support\Facades\Log;
@ -10,6 +11,15 @@ date_default_timezone_set('UTC');
class WeiboService class WeiboService
{ {
private $files = [];
private $videoDir = "/Volumes/Crucial X6/Image/weibo/video/";
private $imageDir = "/Volumes/Crucial X6/Image/weibo/image/";
public function __construct()
{
$this->files = array_merge($this->files, scanFilesWithoutPath($this->imageDir));
$this->files = array_merge($this->files, scanFilesWithoutPath($this->videoDir));
}
public function scrapeWeiboPicAndVideo($content) public function scrapeWeiboPicAndVideo($content)
{ {
@ -113,8 +123,10 @@ class WeiboService
public function scrapeGroupWeiboPicAndVideo($content) public function scrapeGroupWeiboPicAndVideo($content)
{ {
$video_dir = "/Volumes/Samsung/weibo/video";
$image_dir = "/Volumes/Samsung/weibo/image";
// $video_dir = "/Volumes/Samsung/weibo/video";
// $image_dir = "/Volumes/Samsung/weibo/image";
$video_dir = "/Volumes/Crucial X6/Image/weibo/video/";
$image_dir = "/Volumes/Crucial X6/Image/weibo/image/";
try { try {
if (strlen($content) > 0) { if (strlen($content) > 0) {
@ -203,10 +215,10 @@ class WeiboService
return ((float)$usec + (float)$sec); return ((float)$usec + (float)$sec);
} }
function process_pic($pics, $subdir, $user, $text)
function process_pic($pics, $subDir, $user, $text)
{ {
if (!file_exists($subdir)) {
mkdir($subdir);
if (!file_exists($subDir)) {
mkdir($subDir);
} }
$h2w = 0; $h2w = 0;
foreach ($pics as $pic) { foreach ($pics as $pic) {
@ -224,17 +236,21 @@ class WeiboService
if ($h2w > 15) { if ($h2w > 15) {
continue; continue;
} }
$pic_name = pathinfo($pic_url, PATHINFO_FILENAME);
$pic_ext = pathinfo($pic_url, PATHINFO_EXTENSION);
$picName = pathinfo($pic_url, PATHINFO_FILENAME);
$picExt = pathinfo($pic_url, PATHINFO_EXTENSION);
$user_name = Arr::get($user, "screen_name", ''); $user_name = Arr::get($user, "screen_name", '');
if ($user_name != '') { if ($user_name != '') {
$pic_name = $user_name . '--' .$pic_name;
$picName = $user_name . '--' .$picName;
} }
$file_name = $subdir . "/" . $pic_name . "." . $pic_ext;
if (!file_exists($file_name)) {
$file_name = $subDir . DIRECTORY_SEPARATOR . $picName . "." . $picExt;
$baseName = $picName . "." . $picExt;
if (!file_exists($file_name) && !$this->checkFileHasDownload($baseName)) {
$pic_content = file_get_contents($pic_url); $pic_content = file_get_contents($pic_url);
// echo $pic_content; // echo $pic_content;
file_put_contents($file_name, $pic_content); file_put_contents($file_name, $pic_content);
$this->files[] = $baseName;
} else {
Log::info("$baseName file exists");
} }
} }
} }
@ -261,10 +277,27 @@ class WeiboService
// $video_origin_name = pathinfo($video_url, PATHINFO_FILENAME); // $video_origin_name = pathinfo($video_url, PATHINFO_FILENAME);
// $video_ext = pathinfo($video_url, PATHINFO_EXTENSION); // $video_ext = pathinfo($video_url, PATHINFO_EXTENSION);
$file_name = $subdir . "/" . $video_name . "--" . $video_origin_name; $file_name = $subdir . "/" . $video_name . "--" . $video_origin_name;
if (!file_exists($file_name)) {
$baseName = $video_name . "--" . $video_origin_name;
if (!file_exists($file_name) && !$this->checkFileHasDownload($baseName)) {
# code... # code...
$video_content = file_get_contents($video_url); $video_content = file_get_contents($video_url);
file_put_contents($file_name, $video_content); file_put_contents($file_name, $video_content);
$this->files[] = $baseName;
} else {
Log::info("$baseName file exists");
}
}
private function checkFileHasDownload($fileName)
{
if (in_array($fileName, $this->files)) {
Log::info("$fileName exist in local files");
return true;
}
$record = ImageRecord::where("name", $fileName)->first();
if ($record != null) {
Log::info("$fileName exist in db");
return true;
} }
} }


+ 256
- 0
app/Services/XiaoyuService.php View File

@ -0,0 +1,256 @@
<?php
namespace App\Services;
use App\ImageRecord;
use Exception;
use ErrorException;
use Log;
use QL\QueryList;
class XiaoyuService
{
public $domainUrl = "https://www.xiurenb.net/";
public $xiurenRootUrl = "https://www.xiurenb.net/XiaoYu/";
// public $rootDir = "/Users/shixuesen/Documents/tmp/xiuren/";
// public $rootDir = "/Volumes/Backup/images/xiuren/";
public $rootDir = "/Volumes/Crucial X6/Image/xiaoyu/";
public $queryInstance;
public $queryNew;
/**
 * Prepare the two QueryList objects this service works with.
 *
 * $queryInstance holds whatever QueryList::getInstance() returns and is the
 * object getEncodeHtmlContent() loads the converted HTML into and hands back
 * to callers; $queryNew is a separately constructed QueryList that receives
 * the raw, unconverted response.
 */
public function __construct()
{
$this->queryInstance = QueryList::getInstance();
$this->queryNew = new QueryList();
}
/**
 * Walk the XiaoYu listing pages and scrape every album linked from them.
 *
 * The first listing page is fetched and the value found in
 * ".page span strong" is printed; nothing is scraped unless it is a positive
 * number. That value only acts as an on/off switch: it is then replaced by a
 * fixed 20, so with a page size of 20 the listing pages "index.html" and
 * "index1.html" are visited.
 */
public function scrapeAlbum()
{
    $pageSize = 20;
    $firstListing = $this->getEncodeHtmlContent("https://www.xiurenb.net/XiaoYu/index.html");
    $pageCount = $firstListing->find(".page span strong")->htmls()->get(0);
    print_r($pageCount);

    if ((int)$pageCount <= 0) {
        return;
    }

    // The scraped count is discarded in favour of a fixed budget of 20.
    $pageCount = 20;
    $lastPage = ceil($pageCount / $pageSize);
    for ($page = 0; $page <= $lastPage; $page++) {
        // The first listing page has no number in its file name.
        $urlSuffix = $page == 0 ? "index.html" : "index" . $page . ".html";
        $this->scrapePageAlbum($this->xiurenRootUrl . $urlSuffix);
    }
}
/**
 * Scrape every album linked from one listing page.
 *
 * @param string $url Absolute URL of the listing page.
 */
public function scrapePageAlbum($url)
{
    $pageContent = $this->getEncodeHtmlContent($url);

    // Copy the hrefs out as plain strings first: getEncodeHtmlContent() always
    // returns the same QueryList object, and every nested album fetch reloads it.
    $hrefs = [];
    foreach ($pageContent->find(".i_list a")->getElements() as $element) {
        $hrefs[] = (string)$element->getAttribute("href");
    }

    foreach ($hrefs as $href) {
        dump($href);
        if ($href === "") {
            // An anchor without a target would resolve to the site root.
            continue;
        }
        // domainUrl ends with "/"; join so that a root-relative href ("/XiaoYu/1.html")
        // does not yield a double slash.
        $albumUrl = rtrim($this->domainUrl, "/") . "/" . ltrim($href, "/");
        $this->scrapeSingleAlbum($albumUrl);
    }
}
/**
 * Download one album: its first page plus the follow-up pages linked from
 * the album's pager.
 *
 * The album is skipped when its title cannot be read from the page or when
 * checkAlbumHasDownload() reports it as already recorded.
 *
 * @param string $url Absolute URL of the album's first page.
 */
public function scrapeSingleAlbum($url)
{
    Log::info("scrapeSingleAlbum $url");
    $pageContent = $this->getEncodeHtmlContent($url);
    $albumName = $pageContent->find(".item_title h1")->htmls()->get(0);
    $pageItems = $pageContent->find(".content:eq(0) .page a")->attrs("href")->all();

    // Without a title the target directory would collapse to rootDir itself
    // and images of unrelated albums would be mixed together.
    if ($albumName === null || trim($albumName) === "") {
        Log::error("album title not found, skip album: " . $url);
        return;
    }
    if ($this->checkAlbumHasDownload($albumName)) {
        Log::info("已经下载过了,相册名:" . $albumName);
        return;
    }

    $albumDir = $this->rootDir . $albumName;
    $imageNo = 1;
    $this->parseContent($albumDir, $pageContent, $imageNo);

    // Drop the first two pager links and the last one; presumably these are
    // the navigation/current-page entries rather than further pages - verify
    // against the site's markup.
    $pageItems = array_slice($pageItems, 2, count($pageItems) - 3);
    foreach ($pageItems as $item) {
        // domainUrl ends with "/"; avoid a double slash for root-relative links.
        $pageUrl = rtrim($this->domainUrl, "/") . "/" . ltrim((string)$item, "/");
        $pageContent = $this->getEncodeHtmlContent($pageUrl);
        $this->parseContent($albumDir, $pageContent, $imageNo);
    }
}
/**
 * Save every image found on one album page into $dir.
 *
 * Files are stored as "<user>-<imageNo>-<basename>". Files left behind under
 * the two older naming schemes ("<basename>" and "<imageNo>-<basename>") are
 * renamed one step towards the current scheme instead of being downloaded
 * again, and an image whose final file already exists is skipped.
 *
 * @param string $dir         Target directory; created when missing.
 * @param mixed  $pageContent Query object loaded with the album page (must
 *                            offer find()).
 * @param int    $imageNo     Running image number of the album, passed by
 *                            reference and advanced once per image.
 */
public function parseContent($dir, $pageContent, &$imageNo)
{
    if (!is_dir($dir)) {
        try {
            $created = mkdir($dir);
        } catch (Exception $e) {
            Log::error($e->getMessage());
            return;
        }
        if (!$created) {
            // mkdir() reports failure through its return value when warnings
            // are not converted to exceptions.
            Log::error("can not create directory " . $dir);
            return;
        }
    }

    $images = $pageContent->find(".content p img")->getElements();
    $user = $pageContent->find(".item_info div a:eq(-1) span")->htmls()->get(0);
    dump("user is " . $user);
    // $user is null when the selector matches nothing.
    $userPrefix = trim((string)$user);

    foreach ($images as $image) {
        usleep(random_int(10, 100) * 100);
        $imageUrl = $image->getAttribute("src");
        // NOTE(review): images are requested from xiurenji.net although the
        // pages come from xiurenb.net - confirm this host is still intended.
        $trueImageUrl = "https://www.xiurenji.net" . $imageUrl;
        $fileInfo = pathinfo($trueImageUrl);

        $bareFile = $dir . DIRECTORY_SEPARATOR . $fileInfo["basename"];
        $numberedFile = $dir . DIRECTORY_SEPARATOR . $imageNo . "-" . $fileInfo["basename"];
        $finalFile = $dir . DIRECTORY_SEPARATOR . $userPrefix . "-" . $imageNo . "-" . $fileInfo["basename"];

        if (file_exists($bareFile)) {
            rename($bareFile, $numberedFile);
            $imageNo++;
            continue;
        }
        if (file_exists($numberedFile)) {
            rename($numberedFile, $finalFile);
            $imageNo++;
            continue;
        }
        if (file_exists($finalFile)) {
            $imageNo++;
            continue;
        }

        dump($fileInfo);
        $content = $this->fetchImageWithRetry($trueImageUrl);
        if ($content !== "") {
            file_put_contents($finalFile, $content);
        } else {
            Log::error("image content is empty " . $trueImageUrl);
        }
        $imageNo++;
    }
}

/**
 * Fetch one image with cURL, retrying up to 100 times with a random 1-10 s pause.
 *
 * @param string $trueImageUrl Absolute image URL.
 * @return string Response body, or "" when every attempt failed.
 */
private function fetchImageWithRetry($trueImageUrl)
{
    for ($attempts = 0; $attempts < 100; $attempts++) {
        try {
            $curl_handle = curl_init();
            curl_setopt($curl_handle, CURLOPT_URL, $trueImageUrl);
            // 20 s connect timeout; the plain CURLOPT_CONNECTTIMEOUT option
            // counts seconds, so 20000 there meant several hours.
            curl_setopt($curl_handle, CURLOPT_CONNECTTIMEOUT_MS, 20000);
            curl_setopt($curl_handle, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($curl_handle, CURLOPT_USERAGENT, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36');
            curl_setopt($curl_handle, CURLOPT_REFERER, $this->xiurenRootUrl);
            curl_setopt($curl_handle, CURLOPT_FOLLOWLOCATION, true);
            curl_setopt($curl_handle, CURLOPT_ENCODING, '');
            curl_setopt($curl_handle, CURLOPT_MAXREDIRS, 10);
            curl_setopt($curl_handle, CURLOPT_TIMEOUT, 0);
            curl_setopt($curl_handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
            curl_setopt($curl_handle, CURLOPT_CUSTOMREQUEST, 'GET');
            curl_setopt($curl_handle, CURLOPT_HTTPHEADER, array(
                'authority: www.xiurenji.net',
                'pragma: no-cache',
                'cache-control: no-cache',
                'sec-ch-ua: "Google Chrome";v="95", "Chromium";v="95", ";Not A Brand";v="99"',
                'sec-ch-ua-mobile: ?0',
                'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
                'sec-ch-ua-platform: "macOS"',
                'accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
                'sec-fetch-site: same-origin',
                'sec-fetch-mode: no-cors',
                'sec-fetch-dest: image',
                'referer: https://www.xiurenji.net/XiuRen/9483.html',
                'accept-language: zh-CN,zh;q=0.9',
                'cookie: UM_distinctid=17cfa8bea8eb9e-0dd0c6d032d0fc-1c306851-13c680-17cfa8bea8fc85; CNZZDATA1278618868=1505121253-1636283360-%7C1636283360; __51cke__=; ASPSESSIONIDQAQAATSQ=LBLGNPMDHKKMNOPDBCEAPIMH; __tins__20641871=%7B%22sid%22%3A%201636291046220%2C%20%22vd%22%3A%202%2C%20%22expires%22%3A%201636292852634%7D; __51laig__=2'
            ));
            $content = curl_exec($curl_handle);
            if ($content === false) {
                $le = new Exception("get image has error: " . curl_error($curl_handle));
                curl_close($curl_handle);
                throw $le;
            }
            curl_close($curl_handle);
            return $content;
        } catch (Exception $e) {
            echo $e->getTraceAsString() . "\n";
            // usleep() takes microseconds: this pauses between 1 and 10 seconds.
            $sleepTime = 10000 * random_int(100, 1000);
            echo "wait for $trueImageUrl sleep {$sleepTime} microseconds \n";
            usleep($sleepTime);
        }
    }
    return "";
}
/**
 * Fetch a page and load it into the shared query object.
 *
 * The response body is passed through iconv_gbk_to_uft8() and loaded into
 * $this->queryInstance, which is returned; the unconverted body is loaded
 * into $this->queryNew. A failed transfer is retried up to 100 times with a
 * random 1-10 s pause; if all attempts fail the query object is loaded with
 * an empty string.
 *
 * Every call returns the same $this->queryInstance object, so a result
 * obtained earlier is overwritten by the next call.
 *
 * @param string $url Absolute URL to fetch.
 * @return mixed $this->queryInstance after setHtml().
 */
public function getEncodeHtmlContent($url)
{
    $attempts = 0;
    $html = "";
    do {
        try {
            $curl = curl_init();
            curl_setopt_array($curl, array(
                CURLOPT_URL => $url,
                CURLOPT_RETURNTRANSFER => true,
                // This option sets Accept-Encoding (compression), not a charset;
                // "" advertises every encoding the cURL build can decode.
                CURLOPT_ENCODING => '',
                CURLOPT_MAXREDIRS => 10,
                CURLOPT_TIMEOUT => 0,
                CURLOPT_FOLLOWLOCATION => true,
                CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
                CURLOPT_CUSTOMREQUEST => 'GET',
                // NOTE(review): certificate checks are switched off; re-enable
                // them if the site serves a valid certificate.
                CURLOPT_SSL_VERIFYPEER => false,
                CURLOPT_SSL_VERIFYHOST => false,
                CURLOPT_HTTPHEADER => array(
                    'authority: www.xiurenji.net',
                    'pragma: no-cache',
                    'cache-control: no-cache',
                    'sec-ch-ua: " Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
                    'sec-ch-ua-mobile: ?0',
                    'upgrade-insecure-requests: 1',
                    'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
                    'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                    'sec-fetch-site: same-origin',
                    'sec-fetch-mode: navigate',
                    'sec-fetch-user: ?1',
                    'sec-fetch-dest: document',
                    'referer: https://www.xiurenji.net/XiuRen/',
                    'accept-language: zh-CN,zh;q=0.9',
                    'cookie: UM_distinctid=177fd93a0ca93c-06b94658d5d337-121a4759-13c680-177fd93a0cbcaf; ASPSESSIONIDCATDQACD=FDPMPCLAMHNCPJFCBLKFLCKH; CNZZDATA1278618868=367774893-1614867004-%7C1625926983; __51cke__=; __tins__20641871=%7B%22sid%22%3A%201625931982756%2C%20%22vd%22%3A%203%2C%20%22expires%22%3A%201625933829110%7D; __51laig__=7'
                ),
            ));

            $response = curl_exec($curl);
            $error = curl_error($curl);
            curl_close($curl);
            if ($response === false) {
                // curl_exec() signals failure by returning false, not by
                // throwing; raise here so the retry branch below actually runs.
                throw new Exception("get html has error: " . $error);
            }

            $html = iconv_gbk_to_uft8($response);
            $this->queryNew->setHtml($response);
        } catch (Exception $e) {
            echo $e->getMessage() . "\n";
            echo $e->getTraceAsString() . "\n";
            // usleep() takes microseconds: this pauses between 1 and 10 seconds.
            $sleepTime = 10000 * random_int(100, 1000);
            echo "sleep {$sleepTime} microseconds \n";
            usleep($sleepTime);
            $attempts++;
            continue;
        }
        break;
    } while ($attempts < 100);
    dump("current url: " . $url);
    return $this->queryInstance->setHtml($html);
}
/**
 * Tell whether an ImageRecord row with this album name already exists.
 *
 * @param string|null $albumName Album title as read from the page.
 * @return bool True when a matching record was found, false otherwise
 *              (previously the "not found" case fell through and returned null).
 */
private function checkAlbumHasDownload($albumName)
{
    return ImageRecord::where("name", $albumName)->first() !== null;
}
}

+ 146
- 114
app/Services/XiurenjiService.php View File

@ -4,6 +4,7 @@
namespace App\Services; namespace App\Services;
use App\ImageRecord;
use Exception; use Exception;
use ErrorException; use ErrorException;
use Log; use Log;
@ -11,97 +12,124 @@ use QL\QueryList;
class XiurenjiService class XiurenjiService
{ {
public $domainUrl = "https://www.xiurenji.net";
public $xiurenRootUrl = "https://www.xiurenji.net/XiuRen/";
// public $rootDir = "/Users/shixuesen/Documents/tmp/xiuren/";
// public $rootDir = "/Volumes/Backup/images/xiuren/";
public $rootDir = "/Volumes/intel660p/image/xiuren/";
private $name_dir = [
"xiuren" => [
"path" => "XiuRen/",
"dir" => "xiuren/"
],
"xiaoyu" => [
"path" => "XiaoYu/",
"dir" => "xiaoyu/"
],
"youwu" => [
"path" => "YouWu/",
"dir" => "youwu/"
],
"mygirl" => [
"path" => "MyGirl/",
"dir" => "mygirl/"
],
"huayang" => [
"path" => "HuaYang/",
"dir" => "huayang/"
],
"mfstar" => [
"path" => "MFStar/",
"dir" => "mfstar/"
],
"imiss" => [
"path" => "IMiss/",
"dir" => "imiss/"
]
];
public $domainUrl = "https://www.xiurenb.net/";
public $xiurenRootUrl = "https://www.xiurenb.net/XiuRen/";
// public $rootDir = "/Users/shixuesen/Documents/tmp/xiuren/";
// public $rootDir = "/Volumes/Backup/images/xiuren/";
public $rootDir = "/Volumes/Crucial X6/Image/xr/";
public $queryInstance; public $queryInstance;
public $queryNew; public $queryNew;
public function __construct() public function __construct()
{ {
$this->queryInstance = QueryList::getInstance(); $this->queryInstance = QueryList::getInstance();
$this->queryNew = new QueryList(); $this->queryNew = new QueryList();
} }
public function scrapeAlbum()
public function scrapeAll() {
foreach ($this->name_dir as $key => $value) {
# code...
dump("current site: " . $key);
$this->scrapeAlbum($key, 20);
}
// $this->scrapeAlbum("xiuren", 20);
// $this->scrapeAlbum("xiaoyu", 20);
// $this->scrapeAlbum("youwu", 20);
// $this->scrapeAlbum("mygirl", 20);
// $this->scrapeAlbum("huayang", 20);
// $this->scrapeAlbum("mfstar", 20);
// $this->scrapeAlbum("imiss", 20);
}
public function scrapeAlbum($path, $num = 20, $startPage = 0)
{ {
$pageSize = 20; $pageSize = 20;
$pageCount = $this->getEncodeHtmlContent("https://www.xiurenji.net/XiuRen/index.html")->find(".page span")->htmls()->get(0);
print_r($pageCount);
$urlPath = $this->name_dir[$path]["path"];
$rootDir = $this->rootDir;
$this->rootDir = $this->rootDir . $this->name_dir[$path]["dir"];
$pageCount = $this->getEncodeHtmlContent("https://www.xiurenb.net/$urlPath/index.html")->find(".page span strong")->htmls()->get(0);
dump("current site item count: " . $pageCount);
if ((int)$pageCount > 0) { if ((int)$pageCount > 0) {
$pageCount = 100;
for ($i = 0; $i <= ceil($pageCount / $pageSize); $i++) {
$pageCount = min($pageCount, $num);
for ($i = $startPage; $i <= ceil($pageCount / $pageSize); $i++) {
$urlSuffix = ""; $urlSuffix = "";
if ($i == 0) { if ($i == 0) {
$urlSuffix = "index.html"; $urlSuffix = "index.html";
} else { } else {
$urlSuffix = "index" . $i . ".html"; $urlSuffix = "index" . $i . ".html";
} }
$this->scrapePageAlbum($this->xiurenRootUrl . $urlSuffix);
// exit;
$this->scrapePageAlbum($this->domainUrl . $urlPath . $urlSuffix);
} }
} }
$this->rootDir = $rootDir;
} }
public function scrapePageAlbum($url) {
public function scrapePageAlbum($url)
{
$pageContent = $this->getEncodeHtmlContent($url); $pageContent = $this->getEncodeHtmlContent($url);
// dump($pageContent);
$items = $pageContent->find(".dan a")->getElements();
$items = $pageContent->find(".i_list a")->getElements();
$i = 0;
foreach ($items as $item) { foreach ($items as $item) {
// $i++;
// if ($i < 18) {
// continue;
// # code...
// }
dump($item->getAttribute("href")); dump($item->getAttribute("href"));
$this->scrapeSingleAlbum($this->domainUrl . $item->getAttribute("href")); $this->scrapeSingleAlbum($this->domainUrl . $item->getAttribute("href"));
} }
} }
public function scrapeSingleAlbum($url) {
public function scrapeSingleAlbum($url)
{
Log::info("scrapeSingleAlbum $url"); Log::info("scrapeSingleAlbum $url");
$pageContent = $this->getEncodeHtmlContent($url); $pageContent = $this->getEncodeHtmlContent($url);
$pageSize = 3;
$items = $pageContent->find(".ina p:nth-child(2)")->texts();
$pageItems = $pageContent->find(".page a:eq(-2)")->htmls();
$isSinglePage = false;
$pageCount = 0;
if (count($pageItems) <= 0) {
$isSinglePage = true;
$pageCount = 1;
// dump("this album is error: ". $url);
// Log::error("this album is error: " . $url);
// return;
$albumName = $pageContent->find(".item_title h1")->htmls()->get(0);
$pageItems = $pageContent->find(".content:eq(0) .page a")->attrs("href")->all();
if ($this->checkAlbumHasDownload($albumName)) {
Log::info("已经下载过了,相册名:" . $albumName);
return;
}
dump("当前相册名: " . $albumName);
$imageNo = 1;
$this->parseContent($this->rootDir . $albumName, $pageContent, $imageNo);
$pageItems = array_slice($pageItems, 2, count($pageItems) - 3);
foreach ($pageItems as $item) {
$pageContent = $this->getEncodeHtmlContent($this->domainUrl . $item);
$this->parseContent($this->rootDir . $albumName, $pageContent, $imageNo);
} }
// dump($pageItems);exit;
$item = $items[0];
// $imageNum = substr($item, strrpos($item, "[") + 1, strrpos($item, "P]") - strrpos($item, "[") - 1);
// $pageCount = ceil($imageNum / $pageSize);
$pageCount = $pageCount > 0 ? $pageCount : (int)$pageItems[0];
// dump("pageCount: ". $pageCount . "");exit;
$slashPos = strpos($url, "XiuRen/") + 7;
$dotPos = strrpos($url, ".");
$albumCode = substr($url, $slashPos, $dotPos - $slashPos);
// $albumName0 = substr($item, strpos($item, "["), strrpos($item, "]") - strpos($item, "[") + 1);
$albumName = ltrim(substr($item, 6, strrpos($item, "]") - 5));
if (mb_strlen($albumName) <= 12 || mb_strlen($albumName) >= 50 || !str_contains($albumName, "No")) {
dump("old Album: " . $albumName);
$albumName = $pageContent->find(".ina p b:nth-child(2)")->texts();
// $albumName = ltrim(substr($item, 6, strrpos(substr($item, 0, strrpos($item, "@")), " ") - 5));
dump("new Album: " . urldecode($albumName[0]));
$albumName = urldecode($albumName[0]);
}
$imageNo = 1;
$this->parseContent($this->rootDir. $albumName, $pageContent, $imageNo);
dump("albumName: ". $albumName);
// dump("item: ". $item);
// exit;
for ($i = 1; $i < $pageCount; $i++) {
$pageContent = $this->getEncodeHtmlContent($this->xiurenRootUrl . $albumCode . "_" . $i . ".html");
$this->parseContent($this->rootDir. $albumName, $pageContent, $imageNo);
}
// dump("slashPos: " . $slashPos . " dotPos: " . $dotPos . " albumCode: ". $albumCode);
// dump($item);
// exit;
} }
public function parseContent($dir, $pageContent, &$imageNo) public function parseContent($dir, $pageContent, &$imageNo)
@ -114,43 +142,38 @@ class XiurenjiService
return; return;
} }
} }
$images = $pageContent->find(".img p img")->getElements();
$user = $pageContent->find(".title_pc tr:eq(2) td a:eq(2)")->html();
// $user = $this->queryNew->find(".ina a:eq(-1) b")->html();
// body > div.nr3 > table.title_pc > tbody > tr:nth-child(3) > td > a:nth-child(2)
$user = iconv_gbk_to_uft8($user);
dump("user is " . $user);
if (str_contains($user, "#") || str_contains($user, "&") || trim($user) == "") {
$user = $this->queryNew->find(".ina a:eq(-1) b")->html();;
$user = iconv_gbk_to_uft8($user);
dump("new user is " . $user);
$images = $pageContent->find(".content p img")->getElements();
$user = $pageContent->find(".item_info div a:eq(-1) span")->htmls()->get(0);
if ($imageNo == 1) {
// 只在每个相册第一次输出名字
dump("user is " . $user);
} }
foreach ($images as $image) { foreach ($images as $image) {
usleep(random_int(10, 100) * 100); usleep(random_int(10, 100) * 100);
$imageUrl = $image->getAttribute("src"); $imageUrl = $image->getAttribute("src");
$trueImageUrl = "https://x1.plmn5.com/U". substr($imageUrl, 2);
$trueImageUrl = "https://www.xiurenji.net" . $imageUrl;
$trueImageUrl = "https://www.xiurenji.net" . $imageUrl;
$fileInfo = pathinfo($trueImageUrl); $fileInfo = pathinfo($trueImageUrl);
if (file_exists($dir . "/" .$fileInfo["basename"])) {
rename($dir . "/" . $fileInfo["basename"], $dir . "/" . $imageNo . "-" . $fileInfo["basename"]);
if (file_exists($dir . DIRECTORY_SEPARATOR . $fileInfo["basename"])) {
rename($dir . DIRECTORY_SEPARATOR . $fileInfo["basename"], $dir . DIRECTORY_SEPARATOR . $imageNo . "-" . $fileInfo["basename"]);
$imageNo++; $imageNo++;
continue; continue;
} }
if (file_exists($dir . "/" . $imageNo . "-" . $fileInfo["basename"])) {
rename($dir . "/" . $imageNo . "-" . $fileInfo["basename"], $dir . "/" . trim($user) . "-" . $imageNo . "-" . $fileInfo["basename"]);
if (file_exists($dir . DIRECTORY_SEPARATOR . $imageNo . "-" . $fileInfo["basename"])) {
rename($dir . DIRECTORY_SEPARATOR . $imageNo . "-" . $fileInfo["basename"], $dir . DIRECTORY_SEPARATOR . trim($user) . "-" . $imageNo . "-" . $fileInfo["basename"]);
$imageNo++; $imageNo++;
continue; continue;
} }
if (file_exists($dir . "/" . trim($user) . "-" . $imageNo . "-" . $fileInfo["basename"])) {
if (file_exists($dir . DIRECTORY_SEPARATOR . trim($user) . "-" . $imageNo . "-" . $fileInfo["basename"])) {
$imageNo++; $imageNo++;
continue; continue;
} }
dump($fileInfo);
// dump($fileInfo);
$attempts = 0; $attempts = 0;
$content = ""; $content = "";
do { do {
try { try {
$curl_handle=curl_init();
$curl_handle = curl_init();
curl_setopt($curl_handle, CURLOPT_URL, $trueImageUrl); curl_setopt($curl_handle, CURLOPT_URL, $trueImageUrl);
curl_setopt($curl_handle, CURLOPT_CONNECTTIMEOUT, 20000); curl_setopt($curl_handle, CURLOPT_CONNECTTIMEOUT, 20000);
curl_setopt($curl_handle, CURLOPT_RETURNTRANSFER, 1); curl_setopt($curl_handle, CURLOPT_RETURNTRANSFER, 1);
@ -162,21 +185,21 @@ class XiurenjiService
curl_setopt($curl_handle, CURLOPT_TIMEOUT, 0); curl_setopt($curl_handle, CURLOPT_TIMEOUT, 0);
curl_setopt($curl_handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1); curl_setopt($curl_handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
curl_setopt($curl_handle, CURLOPT_CUSTOMREQUEST, 'GET'); curl_setopt($curl_handle, CURLOPT_CUSTOMREQUEST, 'GET');
curl_setopt($curl_handle, CURLOPT_HTTPHEADER, array(
'authority: www.xiurenji.net',
'pragma: no-cache',
'cache-control: no-cache',
'sec-ch-ua: "Google Chrome";v="95", "Chromium";v="95", ";Not A Brand";v="99"',
'sec-ch-ua-mobile: ?0',
'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
'sec-ch-ua-platform: "macOS"',
'accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
'sec-fetch-site: same-origin',
'sec-fetch-mode: no-cors',
'sec-fetch-dest: image',
'referer: https://www.xiurenji.net/XiuRen/9483.html',
'accept-language: zh-CN,zh;q=0.9',
'cookie: UM_distinctid=17cfa8bea8eb9e-0dd0c6d032d0fc-1c306851-13c680-17cfa8bea8fc85; CNZZDATA1278618868=1505121253-1636283360-%7C1636283360; __51cke__=; ASPSESSIONIDQAQAATSQ=LBLGNPMDHKKMNOPDBCEAPIMH; __tins__20641871=%7B%22sid%22%3A%201636291046220%2C%20%22vd%22%3A%202%2C%20%22expires%22%3A%201636292852634%7D; __51laig__=2'
curl_setopt($curl_handle, CURLOPT_HTTPHEADER, array(
'authority: www.xiurenji.net',
'pragma: no-cache',
'cache-control: no-cache',
'sec-ch-ua: "Google Chrome";v="95", "Chromium";v="95", ";Not A Brand";v="99"',
'sec-ch-ua-mobile: ?0',
'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
'sec-ch-ua-platform: "macOS"',
'accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
'sec-fetch-site: same-origin',
'sec-fetch-mode: no-cors',
'sec-fetch-dest: image',
'referer: https://www.xiurenji.net/XiuRen/9483.html',
'accept-language: zh-CN,zh;q=0.9',
'cookie: UM_distinctid=17cfa8bea8eb9e-0dd0c6d032d0fc-1c306851-13c680-17cfa8bea8fc85; CNZZDATA1278618868=1505121253-1636283360-%7C1636283360; __51cke__=; ASPSESSIONIDQAQAATSQ=LBLGNPMDHKKMNOPDBCEAPIMH; __tins__20641871=%7B%22sid%22%3A%201636291046220%2C%20%22vd%22%3A%202%2C%20%22expires%22%3A%201636292852634%7D; __51laig__=2'
)); ));
$content = curl_exec($curl_handle); $content = curl_exec($curl_handle);
if ($content === false) { if ($content === false) {
@ -185,36 +208,36 @@ class XiurenjiService
throw $le; throw $le;
} }
curl_close($curl_handle); curl_close($curl_handle);
// $content = file_get_contents($trueImageUrl);
} catch (ErrorException | Exception $e) {
// $content = file_get_contents($trueImageUrl);
} catch (ErrorException|Exception $e) {
echo $e->getTraceAsString() . "\n"; echo $e->getTraceAsString() . "\n";
$sleepTime = 10000 * random_int(100, 1000); $sleepTime = 10000 * random_int(100, 1000);
echo "wait for $trueImageUrl sleep {$sleepTime} nano second \n"; echo "wait for $trueImageUrl sleep {$sleepTime} nano second \n";
usleep($sleepTime); usleep($sleepTime);
$attempts ++;
$attempts++;
continue; continue;
} }
break; break;
} while($attempts < 100);
} while ($attempts < 100);
if ($content != "") { if ($content != "") {
file_put_contents($dir . "/" . trim($user) . "-" . $imageNo . "-" . $fileInfo["basename"], $content);
file_put_contents($dir . DIRECTORY_SEPARATOR . trim($user) . "-" . $imageNo . "-" . $fileInfo["basename"], $content);
} else { } else {
Log::error("image content is empty ". $trueImageUrl);
Log::error("image content is empty " . $trueImageUrl);
} }
$imageNo++; $imageNo++;
// dump("current imageNo: " . $imageNo);
// dump("current imageNo: " . $imageNo);
} }
} }
public function getEncodeHtmlContent($url) {
public function getEncodeHtmlContent($url)
{
$attempts = 0; $attempts = 0;
$html = ""; $html = "";
$arrContextOptions=array(
"ssl"=>array(
"allow_self_signed"=>true,
"verify_peer"=>false,
"verify_peer_name"=>false,
$arrContextOptions = array(
"ssl" => array(
"allow_self_signed" => true,
"verify_peer" => false,
"verify_peer_name" => false,
), ),
); );
do { do {
@ -233,7 +256,7 @@ class XiurenjiService
CURLOPT_SSL_VERIFYPEER => false, CURLOPT_SSL_VERIFYPEER => false,
CURLOPT_SSL_VERIFYHOST => false, CURLOPT_SSL_VERIFYHOST => false,
CURLOPT_HTTPHEADER => array(
CURLOPT_HTTPHEADER => array(
'authority: www.xiurenji.net', 'authority: www.xiurenji.net',
'pragma: no-cache', 'pragma: no-cache',
'cache-control: no-cache', 'cache-control: no-cache',
@ -257,23 +280,32 @@ class XiurenjiService
echo $error; echo $error;
curl_close($curl); curl_close($curl);
// echo $response;
// $html = iconv('gb2312','UTF-8//IGNORE', $response);
// echo $response;
// $html = iconv('gb2312','UTF-8//IGNORE', $response);
$html = iconv_gbk_to_uft8($response); $html = iconv_gbk_to_uft8($response);
$this->queryNew->setHtml($response); $this->queryNew->setHtml($response);
// $html = $response;
// $html = $response;
} catch (Exception $e) { } catch (Exception $e) {
echo $e->getMessage(). "\n";
echo $e->getMessage() . "\n";
echo $e->getTraceAsString() . "\n"; echo $e->getTraceAsString() . "\n";
$sleepTime = 10000 * random_int(100, 1000); $sleepTime = 10000 * random_int(100, 1000);
echo "sleep {$sleepTime} nano second \n"; echo "sleep {$sleepTime} nano second \n";
usleep($sleepTime); usleep($sleepTime);
$attempts ++;
$attempts++;
continue; continue;
} }
break; break;
} while($attempts < 100);
} while ($attempts < 100);
// dump("current url: " . $url);
return $this->queryInstance->setHtml($html); return $this->queryInstance->setHtml($html);
} }
private function checkAlbumHasDownload($albumName)
{
$record = ImageRecord::where("name", $albumName)->first();
if ($record != null) {
return true;
}
}
} }

+ 256
- 0
app/Services/YouwuService.php View File

@ -0,0 +1,256 @@
<?php
namespace App\Services;
use App\ImageRecord;
use Exception;
use ErrorException;
use Log;
use QL\QueryList;
class YouwuService
{
public $domainUrl = "https://www.xiurenb.net/";
public $xiurenRootUrl = "https://www.xiurenb.net/YouWu/";
// public $rootDir = "/Users/shixuesen/Documents/tmp/xiuren/";
// public $rootDir = "/Volumes/Backup/images/xiuren/";
public $rootDir = "/Volumes/Crucial X6/Image/youwu/";
public $queryInstance;
public $queryNew;
/**
 * Prepare the two QueryList objects stored on this service: the one returned
 * by QueryList::getInstance() ($queryInstance) and a separately constructed
 * one ($queryNew).
 */
public function __construct()
{
$this->queryInstance = QueryList::getInstance();
$this->queryNew = new QueryList();
}
/**
 * Walk the YouWu listing pages and scrape every album linked from them.
 *
 * The value found in ".page span strong" on the first listing page
 * (presumably the total number of albums - verify against the site) is
 * printed and, when positive, divided by the page size of 20 to decide how
 * many listing pages ("index.html", "index1.html", ...) are visited.
 */
public function scrapeAlbum()
{
    $pageSize = 20;
    $firstListing = $this->getEncodeHtmlContent("https://www.xiurenb.net/YouWu/index.html");
    $pageCount = $firstListing->find(".page span strong")->htmls()->get(0);
    print_r($pageCount);

    if ((int)$pageCount <= 0) {
        return;
    }

    $lastPage = ceil($pageCount / $pageSize);
    for ($page = 0; $page <= $lastPage; $page++) {
        // The first listing page has no number in its file name.
        $urlSuffix = $page == 0 ? "index.html" : "index" . $page . ".html";
        $this->scrapePageAlbum($this->xiurenRootUrl . $urlSuffix);
    }
}
/**
 * Scrape every album linked from one listing page.
 *
 * @param string $url Absolute URL of the listing page.
 */
public function scrapePageAlbum($url)
{
    $pageContent = $this->getEncodeHtmlContent($url);

    // Copy the hrefs out as plain strings before any nested fetch runs, so the
    // loop does not depend on DOM nodes of a page that may have been replaced.
    $hrefs = [];
    foreach ($pageContent->find(".i_list a")->getElements() as $element) {
        $hrefs[] = (string)$element->getAttribute("href");
    }

    foreach ($hrefs as $href) {
        dump($href);
        if ($href === "") {
            // An anchor without a target would resolve to the site root.
            continue;
        }
        // domainUrl ends with "/"; join so that a root-relative href ("/YouWu/1.html")
        // does not yield a double slash.
        $albumUrl = rtrim($this->domainUrl, "/") . "/" . ltrim($href, "/");
        $this->scrapeSingleAlbum($albumUrl);
    }
}
/**
 * Download one album: its first page plus the follow-up pages linked from
 * the album's pager.
 *
 * The album is skipped when its title cannot be read from the page or when
 * checkAlbumHasDownload() reports it as already recorded.
 *
 * @param string $url Absolute URL of the album's first page.
 */
public function scrapeSingleAlbum($url)
{
    Log::info("scrapeSingleAlbum $url");
    $pageContent = $this->getEncodeHtmlContent($url);
    $albumName = $pageContent->find(".item_title h1")->htmls()->get(0);
    $pageItems = $pageContent->find(".content:eq(0) .page a")->attrs("href")->all();

    // Without a title the target directory would collapse to rootDir itself
    // and images of unrelated albums would be mixed together.
    if ($albumName === null || trim($albumName) === "") {
        Log::error("album title not found, skip album: " . $url);
        return;
    }
    if ($this->checkAlbumHasDownload($albumName)) {
        Log::info("已经下载过了,相册名:" . $albumName);
        return;
    }

    $albumDir = $this->rootDir . $albumName;
    $imageNo = 1;
    $this->parseContent($albumDir, $pageContent, $imageNo);

    // Drop the first two pager links and the last one; presumably these are
    // the navigation/current-page entries rather than further pages - verify
    // against the site's markup.
    $pageItems = array_slice($pageItems, 2, count($pageItems) - 3);
    foreach ($pageItems as $item) {
        // domainUrl ends with "/"; avoid a double slash for root-relative links.
        $pageUrl = rtrim($this->domainUrl, "/") . "/" . ltrim((string)$item, "/");
        $pageContent = $this->getEncodeHtmlContent($pageUrl);
        $this->parseContent($albumDir, $pageContent, $imageNo);
    }
}
public function parseContent($dir, $pageContent, &$imageNo)
{
if (!is_dir($dir)) {
try {
mkdir($dir);
} catch (Exception $e) {
Log::error($e->getMessage());
return;
}
}
$images = $pageContent->find(".content p img")->getElements();
$user = $pageContent->find(".item_info div a:eq(-1) span")->htmls()->get(0);
dump("user is " . $user);
foreach ($images as $image) {
usleep(random_int(10, 100) * 100);
$imageUrl = $image->getAttribute("src");
$trueImageUrl = "https://www.xiurenji.net" . $imageUrl;
$fileInfo = pathinfo($trueImageUrl);
if (file_exists($dir . DIRECTORY_SEPARATOR . $fileInfo["basename"])) {
rename($dir . DIRECTORY_SEPARATOR . $fileInfo["basename"], $dir . DIRECTORY_SEPARATOR . $imageNo . "-" . $fileInfo["basename"]);
$imageNo++;
continue;
}
if (file_exists($dir . DIRECTORY_SEPARATOR . $imageNo . "-" . $fileInfo["basename"])) {
rename($dir . DIRECTORY_SEPARATOR . $imageNo . "-" . $fileInfo["basename"], $dir . DIRECTORY_SEPARATOR . trim($user) . "-" . $imageNo . "-" . $fileInfo["basename"]);
$imageNo++;
continue;
}
if (file_exists($dir . DIRECTORY_SEPARATOR . trim($user) . "-" . $imageNo . "-" . $fileInfo["basename"])) {
$imageNo++;
continue;
}
dump($fileInfo);
$attempts = 0;
$content = "";
do {
try {
$curl_handle = curl_init();
curl_setopt($curl_handle, CURLOPT_URL, $trueImageUrl);
curl_setopt($curl_handle, CURLOPT_CONNECTTIMEOUT, 20000);
curl_setopt($curl_handle, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($curl_handle, CURLOPT_USERAGENT, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36');
curl_setopt($curl_handle, CURLOPT_REFERER, $this->xiurenRootUrl);
curl_setopt($curl_handle, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($curl_handle, CURLOPT_ENCODING, '');
curl_setopt($curl_handle, CURLOPT_MAXREDIRS, 10);
curl_setopt($curl_handle, CURLOPT_TIMEOUT, 0);
curl_setopt($curl_handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
curl_setopt($curl_handle, CURLOPT_CUSTOMREQUEST, 'GET');
curl_setopt($curl_handle, CURLOPT_HTTPHEADER, array(
'authority: www.xiurenji.net',
'pragma: no-cache',
'cache-control: no-cache',
'sec-ch-ua: "Google Chrome";v="95", "Chromium";v="95", ";Not A Brand";v="99"',
'sec-ch-ua-mobile: ?0',
'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
'sec-ch-ua-platform: "macOS"',
'accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
'sec-fetch-site: same-origin',
'sec-fetch-mode: no-cors',
'sec-fetch-dest: image',
'referer: https://www.xiurenji.net/XiuRen/9483.html',
'accept-language: zh-CN,zh;q=0.9',
'cookie: UM_distinctid=17cfa8bea8eb9e-0dd0c6d032d0fc-1c306851-13c680-17cfa8bea8fc85; CNZZDATA1278618868=1505121253-1636283360-%7C1636283360; __51cke__=; ASPSESSIONIDQAQAATSQ=LBLGNPMDHKKMNOPDBCEAPIMH; __tins__20641871=%7B%22sid%22%3A%201636291046220%2C%20%22vd%22%3A%202%2C%20%22expires%22%3A%201636292852634%7D; __51laig__=2'
));
$content = curl_exec($curl_handle);
if ($content === false) {
$le = new Exception("get image has error: " . curl_error($curl_handle));
curl_close($curl_handle);
throw $le;
}
curl_close($curl_handle);
// $content = file_get_contents($trueImageUrl);
} catch (ErrorException|Exception $e) {
echo $e->getTraceAsString() . "\n";
$sleepTime = 10000 * random_int(100, 1000);
echo "wait for $trueImageUrl sleep {$sleepTime} nano second \n";
usleep($sleepTime);
$attempts++;
continue;
}
break;
} while ($attempts < 100);
if ($content != "") {
file_put_contents($dir . DIRECTORY_SEPARATOR . trim($user) . "-" . $imageNo . "-" . $fileInfo["basename"], $content);
} else {
Log::error("image content is empty " . $trueImageUrl);
}
$imageNo++;
// dump("current imageNo: " . $imageNo);
}
}
public function getEncodeHtmlContent($url)
{
$attempts = 0;
$html = "";
$arrContextOptions = array(
"ssl" => array(
"allow_self_signed" => true,
"verify_peer" => false,
"verify_peer_name" => false,
),
);
do {
try {
$curl = curl_init();
curl_setopt_array($curl, array(
CURLOPT_URL => $url,
CURLOPT_RETURNTRANSFER => true,
CURLOPT_ENCODING => 'UTF-8',
CURLOPT_MAXREDIRS => 10,
CURLOPT_TIMEOUT => 0,
CURLOPT_FOLLOWLOCATION => true,
CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
CURLOPT_CUSTOMREQUEST => 'GET',
CURLOPT_SSL_VERIFYPEER => false,
CURLOPT_SSL_VERIFYHOST => false,
CURLOPT_HTTPHEADER => array(
'authority: www.xiurenji.net',
'pragma: no-cache',
'cache-control: no-cache',
'sec-ch-ua: " Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
'sec-ch-ua-mobile: ?0',
'upgrade-insecure-requests: 1',
'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'sec-fetch-site: same-origin',
'sec-fetch-mode: navigate',
'sec-fetch-user: ?1',
'sec-fetch-dest: document',
'referer: https://www.xiurenji.net/XiuRen/',
'accept-language: zh-CN,zh;q=0.9',
'cookie: UM_distinctid=177fd93a0ca93c-06b94658d5d337-121a4759-13c680-177fd93a0cbcaf; ASPSESSIONIDCATDQACD=FDPMPCLAMHNCPJFCBLKFLCKH; CNZZDATA1278618868=367774893-1614867004-%7C1625926983; __51cke__=; __tins__20641871=%7B%22sid%22%3A%201625931982756%2C%20%22vd%22%3A%203%2C%20%22expires%22%3A%201625933829110%7D; __51laig__=7'
),
));
$response = curl_exec($curl);
$error = curl_error($curl);
echo $error;
curl_close($curl);
// echo $response;
// $html = iconv('gb2312','UTF-8//IGNORE', $response);
$html = iconv_gbk_to_uft8($response);
$this->queryNew->setHtml($response);
// $html = $response;
} catch (Exception $e) {
echo $e->getMessage() . "\n";
echo $e->getTraceAsString() . "\n";
$sleepTime = 10000 * random_int(100, 1000);
echo "sleep {$sleepTime} nano second \n";
usleep($sleepTime);
$attempts++;
continue;
}
break;
} while ($attempts < 100);
dump("current url: " . $url);
return $this->queryInstance->setHtml($html);
}
private function checkAlbumHasDownload($albumName)
{
$record = ImageRecord::where("name", $albumName)->first();
if ($record != null) {
return true;
}
}
}

+ 20
- 0
app/Utils/helper.php View File

@ -78,3 +78,23 @@ function iconv_gbk_to_uft8($string){
} }
/**
 * Recursively collect the names (without any directory part) of all regular
 * files found below $path.
 *
 * @param string $path Directory to scan.
 * @return array List of file names; empty when $path is not a directory.
 */
function scanFilesWithoutPath($path): array
{
    if (!is_dir($path)) {
        return [];
    }

    $names = [];
    foreach (scandir($path) as $entry) {
        // Skip the self and parent pseudo-entries returned by scandir().
        if (in_array($entry, [".", ".."], true)) {
            continue;
        }

        $fullPath = $path . DIRECTORY_SEPARATOR . $entry;
        if (is_dir($fullPath)) {
            // Descend and append the sub-directory's file names.
            $names = array_merge($names, scanFilesWithoutPath($fullPath));
        } elseif (is_file($fullPath)) {
            $names[] = $entry;
        }
    }

    return $names;
}

+ 3
- 0
fail.log View File

@ -2412,3 +2412,6 @@
/Users/shixuesen/OneDrive/Pictures/instagram/duyenn.hipp/ https://scontent-lax3-2.cdninstagram.com/v/t51.2885-15/e35/271023175_690888875412119_539636855243307136_n.jpg?se=7&_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=111&_nc_ohc=uX8kSCWkRdkAX-ESbX3&edm=ABmJApABAAAA&ccb=7-4&ig_cache_key=Mjc0MjEwNDQ2MjAwMzcxNTUyMg%3D%3D.2-ccb7-4&oh=00_AT9u-NUFO26OzvXmNv82rLjcg3LyNuZn047A8eTYPwvzvA&oe=61D964CA&_nc_sid=6136e7 /Users/shixuesen/OneDrive/Pictures/instagram/duyenn.hipp/ https://scontent-lax3-2.cdninstagram.com/v/t51.2885-15/e35/271023175_690888875412119_539636855243307136_n.jpg?se=7&_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=111&_nc_ohc=uX8kSCWkRdkAX-ESbX3&edm=ABmJApABAAAA&ccb=7-4&ig_cache_key=Mjc0MjEwNDQ2MjAwMzcxNTUyMg%3D%3D.2-ccb7-4&oh=00_AT9u-NUFO26OzvXmNv82rLjcg3LyNuZn047A8eTYPwvzvA&oe=61D964CA&_nc_sid=6136e7
/Users/shixuesen/OneDrive/Pictures/instagram/duyenn.hipp/ https://scontent-lax3-2.cdninstagram.com/v/t51.2885-15/e35/271200431_406072771311487_3149765346719126659_n.jpg?se=7&_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=107&_nc_ohc=vmY2J-jCQJQAX9G5WJF&edm=ABmJApABAAAA&ccb=7-4&ig_cache_key=Mjc0MjEwNDQ2MTkxMTM3NzM2MQ%3D%3D.2-ccb7-4&oh=00_AT_AWjgpeZtbY2FimwMPXSsJ-8w3-N5N2d6bo8r_PmxaBg&oe=61D9D733&_nc_sid=6136e7 /Users/shixuesen/OneDrive/Pictures/instagram/duyenn.hipp/ https://scontent-lax3-2.cdninstagram.com/v/t51.2885-15/e35/271200431_406072771311487_3149765346719126659_n.jpg?se=7&_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=107&_nc_ohc=vmY2J-jCQJQAX9G5WJF&edm=ABmJApABAAAA&ccb=7-4&ig_cache_key=Mjc0MjEwNDQ2MTkxMTM3NzM2MQ%3D%3D.2-ccb7-4&oh=00_AT_AWjgpeZtbY2FimwMPXSsJ-8w3-N5N2d6bo8r_PmxaBg&oe=61D9D733&_nc_sid=6136e7
/Users/shixuesen/OneDrive/Pictures/instagram/duyenn.hipp/ https://scontent-lax3-1.cdninstagram.com/v/t51.2885-15/e35/p480x480/271184973_1101596533921696_3101847491200714204_n.jpg?_nc_ht=scontent-lax3-1.cdninstagram.com&_nc_cat=104&_nc_ohc=TOnuDg8jNGoAX9fYYjY&edm=ABmJApABAAAA&ccb=7-4&ig_cache_key=Mjc0MjAwNjE1NzkwMjQ4MjA0MA%3D%3D.2-ccb7-4&oh=00_AT8VY0T2re4YqMfthluidg7dx3aHiyqCWFeHW1js-Sq5RQ&oe=61DA983E&_nc_sid=6136e7 /Users/shixuesen/OneDrive/Pictures/instagram/duyenn.hipp/ https://scontent-lax3-1.cdninstagram.com/v/t51.2885-15/e35/p480x480/271184973_1101596533921696_3101847491200714204_n.jpg?_nc_ht=scontent-lax3-1.cdninstagram.com&_nc_cat=104&_nc_ohc=TOnuDg8jNGoAX9fYYjY&edm=ABmJApABAAAA&ccb=7-4&ig_cache_key=Mjc0MjAwNjE1NzkwMjQ4MjA0MA%3D%3D.2-ccb7-4&oh=00_AT8VY0T2re4YqMfthluidg7dx3aHiyqCWFeHW1js-Sq5RQ&oe=61DA983E&_nc_sid=6136e7
/Users/shixuesen/OneDrive/Pictures/instagram/amandacerny/ https://scontent-lax3-1.cdninstagram.com/v/t50.2886-16/10000000_458426249278269_2831399203059854387_n.mp4?cb=9ad74b5e-c1c39920&efg=eyJ2ZW5jb2RlX3RhZyI6InZ0c192b2RfdXJsZ2VuLjcyMC5jbGlwcy5iYXNlbGluZSJ9&_nc_ht=scontent-lax3-1.cdninstagram.com&_nc_cat=109&_nc_ohc=Ex7g5lOfRAEAX_8S2xn&edm=ABmJApABAAAA&vs=639577407291135_135919037&_nc_vs=HBksFQAYJEdJQ1dtQUE5Yzc2dDc2QUJBRFBzdE9PU0prc25icV9FQUFBRhUAAsgBABUAGCRHTmNkTnhBQlBMcFdiUW9CQUdIZnhIUnU4MXQwYnFfRUFBQUYVAgLIAQAoABgAGwAVAAAm3ry53ovKyj8VAigCQzMsF0BNHdLxqfvnGBJkYXNoX2Jhc2VsaW5lXzFfdjERAHX%2BBwA%3D&_nc_rid=4bfbf6bc16&ccb=7-4&oe=61EF5C3A&oh=00_AT9EwLhWcU21qFR93E_OXz6-OqMPgCHplrGD5F_Exc6Z8Q&_nc_sid=6136e7
/Users/shixuesen/OneDrive/Pictures/instagram/parlovetati/ https://scontent-lax3-2.cdninstagram.com/o1/v/t16/f1/m38/3C46336060C0E88EF14A62867DEF31A7_video_dashinit.mp4?efg=eyJ2ZW5jb2RlX3RhZyI6InZ0c192b2RfdXJsZ2VuLjcyMC5zdG9yeS5iYXNlbGluZW9pbCJ9&_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=106&vs=1373815423049090_3319206028&_nc_vs=HBksFQIYRGlnX3hwdl9lcGhlbWVyYWwvM0M0NjMzNjA2MEMwRTg4RUYxNEE2Mjg2N0RFRjMxQTdfdmlkZW9fZGFzaGluaXQubXA0FQACyAEAFQAYJEdQVktQUkFYc1FZRG9lc0FBTEs4UDNaYmcxTlZicGt3QUFBRhUCAsgBACgAGAAbAYgHdXNlX29pbAExFQAAJt6%2B16WA0%2Bo%2FFQIoAkMzLBdABzMzMzMzMxgVZGFzaF9iYXNlbGluZW9pbF8xX3YxEQB16AcA&_nc_rid=d8368805af&cb=9ad74b5e-c1c39920&ccb=9-4&oe=61F1818A&oh=00_AT-vnjS9ihqsdV079FI6YMHhqJ_eosGkWn3E222yC0ijrg&_nc_sid=bab638
/Users/shixuesen/OneDrive/Pictures/instagram/Likes/ https://scontent-lax3-1.cdninstagram.com/v/t51.2885-15/e35/274209204_124732130092305_602221022598421405_n.jpg?_nc_ht=scontent-lax3-1.cdninstagram.com&_nc_cat=109&_nc_ohc=X96gve0P_9MAX_S7q4S&edm=AJ9x6zYBAAAA&ccb=7-4&ig_cache_key=Mjc3NTExNDIxNTkwMjUxNTAxOQ%3D%3D.2-ccb7-4&oh=00_AT9YuSvddWiOn7hDtogr1XboXsovLzCcnc3zKMNRWe0iBA&oe=6214C58B&_nc_sid=cff2a4

Loading…
Cancel
Save