<?php
|
|
|
|
|
|
namespace App\Services;
|
|
|
|
|
|
use Illuminate\Support\Arr;
|
|
use Illuminate\Support\Facades\Log;
|
|
use InstagramAPI\Response\Model\Item;
|
|
|
|
/**
 * Downloads the images and videos of a list of Instagram accounts through the
 * ins246.com search endpoint.
 *
 * The account list is read from "user.txt" inside the base directory, one
 * account name per line; every account gets its own sub-directory below the
 * base directory.
 */
class Ins24Service
{
    /**
     * Append one failed download to "fail.log" (relative to the current
     * working directory) as "<prefix>\t<url>".
     *
     * @param string $filePrefix Target prefix/directory the file was meant for.
     * @param string $fileUrl    URL that could not be downloaded.
     */
    public function logFailUrl($filePrefix, $fileUrl)
    {
        $failLogFile = "fail.log";
        file_put_contents($failLogFile, $filePrefix . "\t" . $fileUrl . PHP_EOL, FILE_APPEND | LOCK_EX);
    }

    /**
     * Download the images of every account in the user list.
     *
     * @param int         $offset       Index of the first account to process.
     * @param int|null    $length       Number of accounts to process; null means "all remaining".
     * @param string      $baseImageDir Directory holding user.txt and the per-account folders.
     */
    public function scrapeUsers($offset = 0, $length = null, $baseImageDir = "/Users/shixuesen/OneDrive/Pictures/instagram/")
    {
        // NOTE(review): image pagination starts at page 0 while video
        // pagination starts at page 1, as in the original code — confirm
        // against the remote API.
        $this->scrapeUserList($baseImageDir, $offset, $length, 0, "mediaURL", function ($userName, $pageNo, $maxId) {
            return $this->queryImage($userName, $pageNo, $maxId);
        });
    }

    /**
     * Download the videos of every account in the user list.
     *
     * @param int         $offset       Index of the first account to process.
     * @param int|null    $length       Number of accounts to process; null means "all remaining".
     * @param string      $baseImageDir Directory holding user.txt and the per-account folders.
     */
    public function scrapeUsersVideos($offset = 0, $length = null, $baseImageDir = "/Users/shixuesen/OneDrive/Pictures/instagram/")
    {
        $this->scrapeUserList($baseImageDir, $offset, $length, 1, "videoUrl", function ($userName, $pageNo, $maxId) {
            return $this->queryVideo($userName, $pageNo, $maxId);
        });
    }

    /**
     * Shared driver for both scrape entry points: iterate the account list,
     * pause a random 0.1–10 s before each account, then page through its media.
     *
     * @param string   $baseImageDir Base directory (trailing slash optional).
     * @param int      $offset       First account index.
     * @param int|null $length       Number of accounts, or null for all.
     * @param int      $firstPageNo  Page number sent with the first request.
     * @param string   $urlKey       Key inside each resource entry that holds the media URL.
     * @param \Closure $fetchPage    fn($userName, $pageNo, $maxId): array — fetches one result page.
     */
    private function scrapeUserList($baseImageDir, $offset, $length, $firstPageNo, $urlKey, \Closure $fetchPage)
    {
        $baseDir = rtrim($baseImageDir, "/") . "/";
        $userNames = $this->loadUserNames($baseDir . "user.txt", $offset, $length);

        foreach ($userNames as $userName) {
            try {
                $sleepTime = random_int(10, 1000) * 10000;
                echo "currentTime: " . date("y-m-d H:i:s") . " sleep: " . $sleepTime;
                usleep($sleepTime);

                $this->scrapeUser($userName, $baseDir . $userName . "/", $firstPageNo, $urlKey, $fetchPage);
            } catch (\Exception $e) {
                // One broken account must not abort the remaining ones, but the
                // failure has to be visible instead of being swallowed.
                Log::error("ins scrape user error: " . $e->getMessage() . " username is " . $userName);
            }
        }
    }

    /**
     * Read the account list, dropping surrounding whitespace and blank lines.
     *
     * @param string   $listFile Path of the account list file.
     * @param int      $offset   First account index.
     * @param int|null $length   Number of accounts, or null for all.
     *
     * @return string[] Account names; empty when the file cannot be read.
     */
    private function loadUserNames($listFile, $offset, $length)
    {
        if (!is_readable($listFile)) {
            Log::error("ins user list is not readable: " . $listFile);

            return [];
        }

        $lines = file($listFile, FILE_IGNORE_NEW_LINES);
        if ($lines === false) {
            Log::error("ins user list could not be read: " . $listFile);

            return [];
        }

        $names = [];
        foreach ($lines as $line) {
            $name = trim($line);
            if ($name !== "") {
                $names[] = $name;
            }
        }

        return array_slice($names, (int) $offset, $length === null ? null : (int) $length);
    }

    /**
     * Page through one account's media and download every entry.
     *
     * Stops when a page is empty, when a file that already exists locally is
     * reached (everything after it is treated as already downloaded), or when
     * a page request fails. A failed request ends this account instead of
     * being retried, because retrying the same page without back-off loops
     * forever while the remote side keeps failing.
     *
     * @param string   $userName    Account name.
     * @param string   $targetDir   Directory (with trailing slash) for this account's files.
     * @param int      $firstPageNo Page number sent with the first request.
     * @param string   $urlKey      Key inside each resource entry that holds the media URL.
     * @param \Closure $fetchPage   fn($userName, $pageNo, $maxId): array.
     */
    private function scrapeUser($userName, $targetDir, $firstPageNo, $urlKey, \Closure $fetchPage)
    {
        $maxId = "";
        $pageNo = $firstPageNo;

        while (true) {
            try {
                $data = $fetchPage($userName, $pageNo, $maxId);
            } catch (\Exception $e) {
                Log::error("ins get user id for name error: " . $e->getMessage() . " username is " . $userName);

                return;
            }

            $resources = $data["resourceList"];
            if (count($resources) === 0) {
                return;
            }

            foreach ($resources as $resource) {
                if (!is_array($resource) || !isset($resource[$urlKey])) {
                    continue;
                }

                $result = $this->downloadFile($resource[$urlKey], 0, $targetDir);
                if ($result === 0) {
                    // File already present: the rest of this account was
                    // fetched by an earlier run.
                    return;
                }

                // Random 1–100 s pause between downloads.
                usleep(random_int(10, 1000) * 100000);
            }

            // Advance exactly one page per iteration.
            $maxId = $data["maxId"];
            $pageNo++;
        }
    }

    /**
     * Fetch one page of image results for an account.
     *
     * @param string     $userName Account name.
     * @param int        $pageNo   Page number to request.
     * @param string     $maxId    Pagination cursor returned by the previous page ("" for the first).
     *
     * @return array The "data" part of the response, with "maxId" and "resourceList" keys.
     *
     * @throws \Exception When the request fails or the response code is not 200.
     */
    private function queryImage($userName, $pageNo = 1, $maxId = "")
    {
        return $this->querySearch($userName, $pageNo, $maxId, 1);
    }

    /**
     * Fetch one page of video results for an account.
     *
     * @param string     $userName Account name.
     * @param int        $pageNo   Page number to request.
     * @param string     $maxId    Pagination cursor returned by the previous page ("" for the first).
     *
     * @return array The "data" part of the response, with "maxId" and "resourceList" keys.
     *
     * @throws \Exception When the request fails or the response code is not 200.
     */
    private function queryVideo($userName, $pageNo = 1, $maxId = "")
    {
        return $this->querySearch($userName, $pageNo, $maxId, 0);
    }

    /**
     * POST one search request and return the decoded "data" section.
     *
     * @param string $userName Account name.
     * @param int    $pageNo   Page number to request.
     * @param string $maxId    Pagination cursor.
     * @param int    $type     Value of the "type" form field: 1 for images, 0 for videos.
     *
     * @return array Response data; "maxId" and "resourceList" are always present.
     *
     * @throws \Exception When the transfer fails or the response code is not 200.
     */
    private function querySearch($userName, $pageNo, $maxId, $type)
    {
        // http_build_query URL-encodes every value, so account names or
        // cursors containing reserved characters cannot corrupt the form body.
        $postFields = http_build_query([
            "search" => "https://www.instagram.com/$userName/",
            "pageNo" => $pageNo,
            "type" => $type,
            "maxId" => $maxId,
        ], "", "&");

        $curl = curl_init();
        curl_setopt_array($curl, [
            CURLOPT_URL => "http://www.ins246.com/ins/search.html",
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_ENCODING => "",
            CURLOPT_MAXREDIRS => 10,
            CURLOPT_TIMEOUT => 200,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
            CURLOPT_CUSTOMREQUEST => "POST",
            CURLOPT_POSTFIELDS => $postFields,
            CURLOPT_HTTPHEADER => $this->searchHeaders(),
        ]);

        // Execute exactly once; a second curl_exec() would repeat the POST.
        $response = curl_exec($curl);
        $curlError = $response === false ? curl_error($curl) : "";
        curl_close($curl);

        if ($response === false) {
            echo 'Curl error: ' . $curlError;

            throw new \RuntimeException('Curl error: ' . $curlError);
        }

        $decoded = json_decode($response, true);
        $succeeded = is_array($decoded)
            && isset($decoded["code"], $decoded["data"])
            && $decoded["code"] == 200
            && is_array($decoded["data"]);

        if (!$succeeded) {
            echo $response;

            // Message text: "an exception occurred".
            throw new \Exception("出现异常");
        }

        $data = $decoded["data"];
        $data["maxId"] = $data["maxId"] ?? "";
        if (!isset($data["resourceList"]) || !is_array($data["resourceList"])) {
            $data["resourceList"] = [];
        }

        echo "userName: $userName, maxId: " . $data["maxId"] . " pageNo: " . $pageNo . " count: " . count($data["resourceList"]) . "\n";

        return $data;
    }

    /**
     * Request headers sent with every search request.
     *
     * NOTE(review): the cookie carries hard-coded session identifiers; they
     * presumably expire and should come from configuration — confirm.
     *
     * @return string[]
     */
    private function searchHeaders()
    {
        return [
            "authority: www.ins246.com",
            "pragma: no-cache",
            "cache-control: no-cache",
            "accept: */*",
            "x-requested-with: XMLHttpRequest",
            "user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
            "content-type: application/x-www-form-urlencoded; charset=UTF-8",
            "origin: http://www.ins246.com",
            "sec-fetch-site: same-origin",
            "sec-fetch-mode: cors",
            "sec-fetch-dest: empty",
            "referer: https://www.ins246.com/ins/gosearch.html",
            "accept-language: en-US,en;q=0.9,zh-TW;q=0.8,zh;q=0.7,ja;q=0.6,zh-CN;q=0.5",
            "cookie: __cfduid=d94e7974e4bbec11a34d72efca94204591604573518; JSESSIONID=EF65EB47872FB6EFEFADA05C8CD140EA; _ga=GA1.2.343334980.1604573522; _gid=GA1.2.1674010257.1604573522; JSESSIONID=6A642D962A119268816A05B9B3D7C8A0",
        ];
    }

    /**
     * Download one media file unless it is already present locally.
     *
     * The local name is the last path segment of the URL (query string
     * removed). A file counts as present when either "<prefix><name>" or
     * "<prefix><name>.back" exists.
     *
     * @param string $filenameUrl URL of the media file.
     * @param int    $flag        Unused; kept so existing callers keep working.
     * @param string $filePrefix  Prefix (normally a directory with trailing slash) for the local file.
     *
     * @return int 0 when the file already exists, 1 when it was downloaded,
     *             -1 when the download or the write failed (the URL is then
     *             recorded through logFailUrl()).
     */
    public function downloadFile($filenameUrl, $flag = 0, $filePrefix = "")
    {
        $filename = $this->fileNameFromUrl($filenameUrl);
        if ($filename === "") {
            $this->logFailUrl($filePrefix, $filenameUrl);

            return -1;
        }

        $target = $filePrefix . $filename;
        if (file_exists($target) || file_exists($target . ".back")) {
            echo "\n file exists " . $target . "\n";

            return 0;
        }

        try {
            $ch = curl_init();
            curl_setopt_array($ch, [
                CURLOPT_URL => $filenameUrl,
                CURLOPT_RETURNTRANSFER => true,
                CURLOPT_AUTOREFERER => false,
                CURLOPT_REFERER => "http://www.ins246.com/ins/gosearch.html",
                CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
                CURLOPT_HEADER => false,
                CURLOPT_CONNECTTIMEOUT => 30,
            ]);
            $image = curl_exec($ch);
            $status = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
            $curlError = curl_error($ch);
            curl_close($ch);
        } catch (\Throwable $e) {
            var_dump($e->getMessage());
            $this->logFailUrl($filePrefix, $filenameUrl);

            return -1;
        }

        // Never write a failed transfer to disk: an empty or error-page file
        // would make the next run believe this media was already downloaded.
        if ($image === false || $image === "" || $status < 200 || $status >= 300) {
            echo "download failed: " . $filenameUrl . " status: " . $status . " error: " . $curlError . "\n";
            $this->logFailUrl($filePrefix, $filenameUrl);

            return -1;
        }

        $targetDir = dirname($target);
        if (!is_dir($targetDir) && !mkdir($targetDir, 0755, true) && !is_dir($targetDir)) {
            $this->logFailUrl($filePrefix, $filenameUrl);

            return -1;
        }

        echo "new filename: " . $target . "\n";
        $bytesWritten = file_put_contents($target, $image);
        if ($bytesWritten === false || $bytesWritten === 0) {
            // Remove a partial/empty file so it is not mistaken for a
            // finished download later.
            if (is_file($target)) {
                unlink($target);
            }
            $this->logFailUrl($filePrefix, $filenameUrl);

            return -1;
        }

        return 1;
    }

    /**
     * Derive the local file name from a media URL: the last segment of the
     * URL path, including its extension and excluding the query string.
     *
     * @param string $url Media URL.
     *
     * @return string File name, or "" when the URL has no usable path segment.
     */
    private function fileNameFromUrl($url)
    {
        $path = parse_url((string) $url, PHP_URL_PATH);
        if (!is_string($path) || $path === "") {
            return "";
        }

        $name = basename($path);
        if ($name === "." || $name === "..") {
            return "";
        }

        return $name;
    }
}
|