[ "dir" => "xq", "code" => 22204, "name" => "小琪" ], "ygh" => [ "dir" => "ygh", "code" => 15902, "name" => "原干惠" ], "ycc" => [ "dir" => "ycc", "code" => 22162, "name" => "杨晨晨" ], "wyc" => [ "dir" => "wyc", "code" => 19702, "name" => "王语纯" ], "zz" => [ "dir" => "zz", "code" => 22899, "name" => "芝芝 booty" ], "hlr" => [ "dir" => "hlr", "code" => 20015, "name" => "黄乐然" ], "jrq" => [ "dir" => "jrq", "code" => 26560, "name" => "姜仁卿" ], "ny" => [ "dir" => "ny", "code" => 26298, "name" => "奈月" ] ];

    /**
     * Download every album image for each entry of self::$name_dir.
     *
     * For each entry, the page https://www.xsnvshen.com/girl/<code> is fetched
     * and its album links are collected. Every image of every album is then
     * saved as:
     *
     *   /Volumes/intel660p/image/xg/<dir>/<albumId>-<albumTitle>/<filename>.jpg
     *
     * Images that already exist on disk are skipped, so the method can be
     * re-run to resume an interrupted crawl. A download that ultimately fails
     * (or returns an empty body) is NOT written to disk, so that a later run
     * retries it instead of skipping an empty placeholder file.
     *
     * Random pauses are inserted between requests.
     *
     * NOTE: the method terminates the PHP process with `exit` once all entries
     * have been processed; this is kept from the original implementation so
     * that callers observe the same behaviour.
     *
     * @return void
     */
    public function scrapeXiuGirls()
    {
        // The image-list request is tried at most $maxAttempts + 1 times.
        $maxAttempts = 5;
        $baseUrl = "https://www.xsnvshen.com";
        $albumSelector = ".entryAblum > .star-mod-bd > ul > li > a";
        $userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36';

        // One curl handle is shared by all image downloads; only the URL and
        // the referer change per request.
        $curl = curl_init();
        // NOTE(review): CURLOPT_CONNECTTIMEOUT is expressed in seconds, so 2000
        // is roughly 33 minutes. CURLOPT_CONNECTTIMEOUT_MS may have been
        // intended - confirm before changing.
        curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 2000);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($curl, CURLOPT_USERAGENT, $userAgent);

        foreach (self::$name_dir as $name) {
            // Pause 1-10 seconds before requesting the next person's page.
            usleep(random_int(1000, 10000) * 1000);

            $peopleUrl = $baseUrl . "/girl/" . $name['code'];
            $items = QueryList::get($peopleUrl)->find($albumSelector)->attrs("href")->all();
            $baseDir = "/Volumes/intel660p/image/xg/" . $name['dir'] . "/";

            foreach ($items as $item) {
                // Anchors without a usable href cannot be followed.
                if (!is_string($item) || $item === '') {
                    continue;
                }

                echo "相册子链接: " .$item . "\n";

                $titles = QueryList::get($baseUrl . $item)->find("h1 > a")->texts();
                $albumTitle = (string) ($titles[0] ?? '');
                echo "相册名: " . $albumTitle . "\n";

                // Presumably the site answers with a placeholder page carrying
                // this title when it throttles the client (TODO confirm); keep
                // re-requesting, with a random pause, until a real album page
                // comes back. As in the original, this loop is unbounded.
                while (trim($albumTitle) === "古诗文") {
                    echo "here error happenned \n";
                    usleep(random_int(1, 1000) * 50000);
                    $titles = QueryList::get($baseUrl . $item)->find("h1 > a")->texts();
                    $albumTitle = (string) ($titles[0] ?? '');
                }

                // The album id is the third "/"-separated segment of the href.
                $segments = explode("/", $item);
                if (!isset($segments[2]) || $segments[2] === '') {
                    echo "unexpected album link, skipped: " . $item . "\n";
                    continue;
                }

                // The title comes from the remote page: strip path separators
                // so it can only ever name a single directory level.
                $albumPath = $baseDir . $segments[2] . "-" . str_replace("/", "-", $albumTitle);
                if (!file_exists($albumPath) && !mkdir($albumPath, 0777, true) && !is_dir($albumPath)) {
                    echo "unable to create directory: " . $albumPath . "\n";
                    continue;
                }

                // Fetch the list of image URLs, retrying on exceptions.
                $images = [];
                for ($attempt = 0; $attempt <= $maxAttempts; $attempt++) {
                    try {
                        $images = QueryList::get($baseUrl . $item)->find(".swi-hd > img")->attrs("src");
                        break;
                    } catch (\Exception $e) {
                        echo $e->getMessage() . "\n";
                        echo $e->getTraceAsString() . "\n";
                        $sleepTime = 10000 * random_int(100, 1000);
                        echo "sleep {$sleepTime} microseconds \n";
                        usleep($sleepTime);
                    }
                }

                foreach ($images as $image) {
                    if (!is_string($image) || $image === '') {
                        continue;
                    }

                    // "src" values are scheme-relative ("//host/path"); the
                    // prefix is only needed so pathinfo() sees a full URL.
                    $target = $albumPath . "/" . pathinfo("http:" . $image)['filename'] . ".jpg";
                    if (file_exists($target)) {
                        continue;
                    }

                    curl_setopt($curl, CURLOPT_URL, "https:" . $image);
                    curl_setopt($curl, CURLOPT_REFERER, $baseUrl . $item);

                    $query = curl_exec($curl);
                    // Up to 99 retries, one second apart.
                    for ($retry = 1; $query === false && $retry < 100; $retry++) {
                        echo 'Curl error: ' . curl_error($curl) ."\n";
                        echo "retry times: " . $retry . " times \n";
                        sleep(1);
                        $query = curl_exec($curl);
                    }

                    if ($query === false || $query === '') {
                        // Do not leave an empty file behind: it would be
                        // mistaken for a finished download on the next run.
                        echo "download failed, skipped: https:" . $image . " " . curl_error($curl) . "\n";
                        continue;
                    }

                    if (file_put_contents($target, $query) === false) {
                        echo "unable to write file: " . $target . "\n";
                        continue;
                    }

                    $sleepTime = 1000 * random_int(100, 1000);
                    echo "after write image sleep {$sleepTime} microseconds \n";
                    usleep($sleepTime);
                }

                // Pause 0.1-1 second between albums.
                usleep(1000 * random_int(100, 1000));
            }
        }

        // Kept from the original: the crawl ends the process when it is done.
        exit;
    }
}