You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

281 lines
11 KiB

<?php
namespace App\Services;
use Illuminate\Support\Arr;
use Illuminate\Support\Facades\Log;
use InstagramAPI\Response\Model\Item;
/**
 * Downloads Instagram images and videos for a list of user names by querying
 * the ins246.com search endpoint and fetching every media URL it returns.
 *
 * Files are written to one sub-directory per user below BASE_IMAGE_DIR.
 */
class Ins24Service
{
    /** Root directory; every user gets a sub-directory named after the account. */
    const BASE_IMAGE_DIR = "/Users/shixuesen/OneDrive/Pictures/instagram/";

    /** Text file that is read line by line; each line is used as a user name. */
    const USER_LIST_FILE = "/Users/shixuesen/OneDrive/Pictures/instagram/user.txt";

    /** Endpoint that receives the search POST request. */
    const SEARCH_URL = "http://www.ins246.com/ins/search.html";

    /** Values of the "type" POST field used for image and video searches. */
    const TYPE_IMAGE = 1;
    const TYPE_VIDEO = 0;

    /**
     * Appends a "<prefix>\t<url>" line to fail.log in the current working directory.
     *
     * @param string $filePrefix Directory/prefix the file was supposed to be stored under.
     * @param string $fileUrl    URL that could not be downloaded or stored.
     */
    public function logFailUrl($filePrefix, $fileUrl)
    {
        $failLogFile = "fail.log";
        $line = $filePrefix . "\t" . $fileUrl . PHP_EOL;

        if (file_put_contents($failLogFile, $line, FILE_APPEND | LOCK_EX) === false) {
            Log::error("ins unable to append to " . $failLogFile . ": " . trim($line));
        }
    }

    /**
     * Downloads the images of every user listed in USER_LIST_FILE.
     *
     * @param int $skipUsers Number of leading lines of the user list to skip. Defaults to 1
     *                       because this method has always dropped the first line.
     *                       NOTE(review): confirm whether that first line is a header or a
     *                       left-over resume offset.
     */
    public function scrapeUsers($skipUsers = 1)
    {
        $this->scrapeAllUsers(false, $skipUsers);
    }

    /**
     * Downloads the videos of every user listed in USER_LIST_FILE.
     *
     * @param int $skipUsers Number of leading lines of the user list to skip.
     */
    public function scrapeUsersVideos($skipUsers = 0)
    {
        $this->scrapeAllUsers(true, $skipUsers);
    }

    /**
     * Reads the user list and scrapes one media kind for each user in turn.
     *
     * A failure for one user is logged and does not stop the remaining users.
     *
     * @param bool $isVideo   true to scrape videos, false to scrape images.
     * @param int  $skipUsers Number of leading lines of the user list to skip.
     */
    private function scrapeAllUsers($isVideo, $skipUsers)
    {
        if (!is_file(self::USER_LIST_FILE) || !is_readable(self::USER_LIST_FILE)) {
            Log::error("ins user list is not readable: " . self::USER_LIST_FILE);
            return;
        }

        $userList = file(self::USER_LIST_FILE);
        if ($userList === false) {
            Log::error("ins user list could not be read: " . self::USER_LIST_FILE);
            return;
        }

        foreach (array_slice($userList, max(0, (int) $skipUsers)) as $userName) {
            $trueName = trim($userName);
            if ($trueName === "") {
                // Blank lines would otherwise be queried as an empty account name.
                continue;
            }

            $this->throttle();

            try {
                $this->scrapeUser($trueName, $isVideo);
            } catch (\Exception $e) {
                // Give up on this user instead of retrying the same page forever.
                Log::error("ins get user id for name error: " . $e->getMessage() . " username is " . $trueName);
            }
        }
    }

    /**
     * Pages through the search results of one user and downloads every returned media URL.
     *
     * Paging stops when a page contains no resources. When downloadFile() reports an already
     * existing file (return value 0) the video scrape stops processing the user, while the
     * image scrape only skips the remainder of the current page and continues with the next.
     * NOTE(review): this difference is carried over unchanged from the two original loops;
     * confirm whether the image scrape should stop at the first existing file as well.
     *
     * @param string $userName Trimmed Instagram user name.
     * @param bool   $isVideo  true to scrape videos, false to scrape images.
     *
     * @throws \Exception When the search request fails or reports an error code.
     */
    private function scrapeUser($userName, $isVideo)
    {
        $userDir = self::BASE_IMAGE_DIR . $userName . "/";
        $urlKey = $isVideo ? "videoUrl" : "mediaURL";
        $maxId = "";

        for ($pageNo = 1; ; $pageNo++) {
            $data = $isVideo
                ? $this->queryVideo($userName, $pageNo, $maxId)
                : $this->queryImage($userName, $pageNo, $maxId);

            $resources = is_array($data) ? ($data["resourceList"] ?? []) : [];
            if (!is_array($resources) || count($resources) === 0) {
                return;
            }

            foreach ($resources as $resource) {
                $url = is_array($resource) ? ($resource[$urlKey] ?? "") : "";
                if (!is_string($url) || $url === "") {
                    continue;
                }

                $res = $this->downloadFile($url, 0, $userDir);
                if ($res == 0) {
                    if ($isVideo) {
                        return;
                    }
                    break;
                }

                // Only pause after an actual download attempt; an existing file costs no request.
                $this->throttle();
            }

            $maxId = $data["maxId"] ?? "";
        }
    }

    /**
     * Sleeps for a random duration between 1 and 100 seconds to space out requests.
     */
    private function throttle()
    {
        usleep(random_int(10, 1000) * 100000);
    }

    /**
     * Fetches one page of image results for a user.
     *
     * @param string     $userName Instagram user name.
     * @param int        $pageNo   Page number sent as "pageNo".
     * @param string     $maxId    Cursor sent as "maxId"; empty for the first page.
     *
     * @return mixed The "data" member of the decoded response.
     *
     * @throws \Exception When the request fails or the response code is not 200.
     */
    private function queryImage($userName, $pageNo = 1, $maxId = "")
    {
        return $this->querySearch($userName, self::TYPE_IMAGE, $pageNo, $maxId);
    }

    /**
     * Fetches one page of video results for a user.
     *
     * @param string     $userName Instagram user name.
     * @param int        $pageNo   Page number sent as "pageNo".
     * @param string     $maxId    Cursor sent as "maxId"; empty for the first page.
     *
     * @return mixed The "data" member of the decoded response.
     *
     * @throws \Exception When the request fails or the response code is not 200.
     */
    private function queryVideo($userName, $pageNo = 1, $maxId = "")
    {
        return $this->querySearch($userName, self::TYPE_VIDEO, $pageNo, $maxId);
    }

    /**
     * Sends a single search POST to SEARCH_URL and returns the decoded "data" member.
     *
     * The raw response body is echoed, as the original query methods did.
     *
     * @param string $userName Instagram user name.
     * @param int    $type     Value of the "type" field (TYPE_IMAGE or TYPE_VIDEO).
     * @param int    $pageNo   Page number sent as "pageNo".
     * @param string $maxId    Cursor sent as "maxId".
     *
     * @return mixed The "data" member of the decoded response, or an empty array when absent.
     *
     * @throws \RuntimeException When the HTTP transfer itself fails.
     * @throws \Exception        When the body is not JSON or its "code" is not 200.
     */
    private function querySearch($userName, $type, $pageNo = 1, $maxId = "")
    {
        // http_build_query percent-encodes the values so a user name cannot break the form body.
        $postFields = http_build_query([
            "search" => "https://www.instagram.com/$userName/",
            "pageNo" => $pageNo,
            "type" => $type,
            "maxId" => $maxId,
        ], "", "&");

        $curl = curl_init();
        curl_setopt_array($curl, [
            CURLOPT_URL => self::SEARCH_URL,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_ENCODING => "",
            CURLOPT_MAXREDIRS => 10,
            // A timeout of 0 never expires; use finite limits so a stalled request cannot hang the run.
            CURLOPT_CONNECTTIMEOUT => 15,
            CURLOPT_TIMEOUT => 60,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
            CURLOPT_CUSTOMREQUEST => "POST",
            CURLOPT_POSTFIELDS => $postFields,
            CURLOPT_HTTPHEADER => [
                "authority: www.ins246.com",
                "pragma: no-cache",
                "cache-control: no-cache",
                "accept: */*",
                "x-requested-with: XMLHttpRequest",
                "user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
                "content-type: application/x-www-form-urlencoded; charset=UTF-8",
                "origin: http://www.ins246.com",
                "sec-fetch-site: same-origin",
                "sec-fetch-mode: cors",
                "sec-fetch-dest: empty",
                "referer: https://www.ins246.com/ins/gosearch.html",
                "accept-language: en-US,en;q=0.9,zh-TW;q=0.8,zh;q=0.7,ja;q=0.6,zh-CN;q=0.5",
                "cookie: __cfduid=d94e7974e4bbec11a34d72efca94204591604573518; JSESSIONID=EF65EB47872FB6EFEFADA05C8CD140EA; _ga=GA1.2.343334980.1604573522; _gid=GA1.2.1674010257.1604573522; JSESSIONID=6A642D962A119268816A05B9B3D7C8A0",
            ],
        ]);

        // Execute exactly once; a second curl_exec() would send the request again.
        $response = curl_exec($curl);
        $curlError = curl_error($curl);
        curl_close($curl);

        if ($response === false) {
            echo 'Curl error: ' . $curlError;
            throw new \RuntimeException("ins246 search request failed: " . $curlError);
        }

        echo $response;

        $formatResponse = json_decode($response, true);
        // Loose comparison on purpose: accepts the code as either 200 or "200".
        if (is_array($formatResponse) && ($formatResponse["code"] ?? null) == 200) {
            return $formatResponse["data"] ?? [];
        }

        throw new \Exception("出现异常");
    }

    /**
     * Downloads one media URL into $filePrefix unless the file is already present.
     *
     * The local file name is the last segment of the URL path, without the query string.
     * A file counts as present when either "<prefix><name>" or "<prefix><name>.back" exists.
     *
     * @param string $filenameUrl URL of the image or video.
     * @param int    $flag        Unused; kept so existing callers keep working.
     * @param string $filePrefix  Directory (with trailing slash) or prefix for the local file.
     *
     * @return int 0 when the file already exists, 1 when it was downloaded and stored,
     *             -1 on any failure (the URL is then appended to fail.log).
     */
    public function downloadFile($filenameUrl, $flag = 0, $filePrefix = "")
    {
        $filename = $this->fileNameFromUrl($filenameUrl);
        if ($filename === "") {
            $this->logFailUrl($filePrefix, $filenameUrl);
            return -1;
        }

        $target = $filePrefix . $filename;
        if (file_exists($target) || file_exists($target . ".back")) {
            echo "\n file exists " . $target;
            return 0;
        }

        // The per-user directory is not created anywhere else, so make sure it exists.
        $targetDir = dirname($target);
        if (!is_dir($targetDir) && !mkdir($targetDir, 0777, true) && !is_dir($targetDir)) {
            $this->logFailUrl($filePrefix, $filenameUrl);
            return -1;
        }

        try {
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $filenameUrl);
            curl_setopt($ch, CURLOPT_VERBOSE, 1);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_AUTOREFERER, false);
            curl_setopt($ch, CURLOPT_REFERER, "http://www.ins246.com/ins/gosearch.html");
            curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
            curl_setopt($ch, CURLOPT_HEADER, 0);
            curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 15);
            curl_setopt($ch, CURLOPT_TIMEOUT, 300);
            $image = curl_exec($ch);
            $status = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
            $curlError = curl_error($ch);
            curl_close($ch);
        } catch (\Throwable $e) {
            var_dump($e->getMessage());
            $this->logFailUrl($filePrefix, $filenameUrl);
            return -1;
        }

        // Never write a failed or empty transfer: the resulting zero-byte file would be
        // reported as "file exists" on the next run and the media would never be retried.
        if ($image === false || $image === "" || $status >= 300) {
            echo "download failed (" . $status . "): " . $curlError . "\n";
            $this->logFailUrl($filePrefix, $filenameUrl);
            return -1;
        }

        echo "new filename: " . $target . "\n";
        $downloadResult = file_put_contents($target, $image);
        if (!$downloadResult) {
            if (is_file($target)) {
                // Remove a partial file so a later run can retry the download.
                unlink($target);
            }
            $this->logFailUrl($filePrefix, $filenameUrl);
            return -1;
        }

        return 1;
    }

    /**
     * Returns the last path segment of a URL, ignoring query string and fragment.
     *
     * @param string $url Media URL.
     *
     * @return string File name including its extension, or "" when the URL has no usable path.
     */
    private function fileNameFromUrl($url)
    {
        $path = parse_url((string) $url, PHP_URL_PATH);
        if (!is_string($path) || $path === "") {
            return "";
        }

        return basename($path);
    }
}