| @ -0,0 +1,160 @@ | |||||
| <?php | |||||
| namespace App\Services; | |||||
| use Illuminate\Support\Arr; | |||||
| use Illuminate\Support\Facades\Log; | |||||
| use InstagramAPI\Response\Model\Item; | |||||
/**
 * Downloads Instagram media for a list of user names by querying the
 * ins246.com search endpoint and saving every returned media URL to disk.
 *
 * Files are stored as <baseImageDir>/<userName>/<file name from the URL>.
 * A file that already exists (or has a ".back" sibling) is taken as the sign
 * that this user has been scraped up to that point, so the scraper stops
 * paging for that user as soon as it meets one.
 */
class Ins24Service
{
    /** Default root directory that holds one sub-directory per user. */
    const DEFAULT_BASE_IMAGE_DIR = "/Users/shixuesen/OneDrive/Pictures/instagram/";

    /** Referer sent with every media download. */
    const REFERER = "https://www.ins246.com/ins/gosearch.html";

    /**
     * Scrape every user listed in the user-list file (one user name per line).
     *
     * A failure for one user is logged and does not stop the remaining users.
     *
     * @param string      $baseImageDir Root directory for downloads; a trailing "/" is optional.
     * @param string|null $userListFile Path of the user list; defaults to "<baseImageDir>user.txt".
     *
     * @return void
     */
    public function scrapeUsers($baseImageDir = self::DEFAULT_BASE_IMAGE_DIR, $userListFile = null)
    {
        $baseImageDir = rtrim($baseImageDir, "/") . "/";
        if ($userListFile === null) {
            $userListFile = $baseImageDir . "user.txt";
        }

        if (!is_readable($userListFile)) {
            Log::error("ins user list is not readable: " . $userListFile);
            return;
        }

        $userList = file($userListFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
        if ($userList === false) {
            Log::error("ins user list could not be read: " . $userListFile);
            return;
        }

        foreach ($userList as $userName) {
            $trueName = trim($userName);
            if ($trueName === "") {
                // A whitespace-only line would otherwise query an empty profile.
                continue;
            }

            try {
                $this->politePause();
                $this->scrapeUser($trueName, $baseImageDir . $trueName . "/");
            } catch (\Exception $e) {
                Log::error("ins get user id for name error: " . $e->getMessage() . " username is " . $trueName);
            }
        }
    }

    /**
     * Page through one user's media and download every item.
     *
     * Stops when a page comes back without resources (nothing left to fetch)
     * or when a file is found that is already on disk. Exceptions raised by
     * queryImage() propagate to the caller, which ends paging for this user
     * instead of retrying the same page forever.
     *
     * @param string $userName     Trimmed Instagram user name.
     * @param string $userImageDir Target directory for this user, ending in "/".
     *
     * @return void
     */
    private function scrapeUser($userName, $userImageDir)
    {
        $maxId = "";

        for ($pageNo = 1; ; $pageNo++) {
            $data = $this->queryImage($userName, $pageNo, $maxId);

            if (empty($data["resourceList"]) || !is_array($data["resourceList"])) {
                return;
            }

            foreach ($data["resourceList"] as $resource) {
                if (!is_array($resource) || empty($resource["mediaURL"])) {
                    continue;
                }

                $res = $this->downloadFile($resource["mediaURL"], 0, $userImageDir);
                if ($res === 0) {
                    // Already on disk: everything from here on was fetched by an earlier run.
                    return;
                }

                $this->politePause();
            }

            // Cursor for the next page; the endpoint receives both pageNo and maxId.
            $maxId = isset($data["maxId"]) ? $data["maxId"] : "";
        }
    }

    /**
     * Sleep for a random 1-10 seconds so requests are not fired back to back.
     *
     * @return void
     */
    private function politePause()
    {
        usleep(random_int(100, 1000) * 10000);
    }

    /**
     * Request one page of media for a user from the ins246.com search endpoint.
     *
     * @param string     $userName Instagram user name.
     * @param int        $pageNo   1-based page number.
     * @param string|int $maxId    Paging cursor taken from the previous page ("" for the first page).
     *
     * @return array The "data" member of the JSON reply; scrapeUser() reads its
     *               "resourceList" and "maxId" keys.
     *
     * @throws \RuntimeException When the request fails, the reply is not valid JSON,
     *                           or the reply's "code" is not 200.
     */
    private function queryImage($userName, $pageNo = 1, $maxId = "")
    {
        // The body is declared as application/x-www-form-urlencoded, so every
        // value has to be percent-encoded; http_build_query() takes care of it.
        $postFields = http_build_query(
            [
                "search" => "https://www.instagram.com/" . $userName . "/",
                "pageNo" => $pageNo,
                "type" => 1,
                "maxId" => $maxId,
            ],
            "",
            "&"
        );

        $curl = curl_init();
        curl_setopt_array($curl, [
            CURLOPT_URL => "https://www.ins246.com/ins/search.html",
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_ENCODING => "",
            CURLOPT_MAXREDIRS => 10,
            // Bounded timeouts: a stalled connection must not hang the whole run.
            CURLOPT_CONNECTTIMEOUT => 15,
            CURLOPT_TIMEOUT => 60,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
            CURLOPT_CUSTOMREQUEST => "POST",
            CURLOPT_POSTFIELDS => $postFields,
            CURLOPT_HTTPHEADER => [
                "authority: www.ins246.com",
                "pragma: no-cache",
                "cache-control: no-cache",
                "accept: */*",
                "x-requested-with: XMLHttpRequest",
                "user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
                "content-type: application/x-www-form-urlencoded; charset=UTF-8",
                "origin: https://www.ins246.com",
                "sec-fetch-site: same-origin",
                "sec-fetch-mode: cors",
                "sec-fetch-dest: empty",
                "referer: https://www.ins246.com/ins/gosearch.html",
                "accept-language: en-US,en;q=0.9,zh-TW;q=0.8,zh;q=0.7,ja;q=0.6,zh-CN;q=0.5",
                // NOTE(review): hard-coded session cookie; presumably it expires and
                // should come from configuration - confirm.
                "cookie: __cfduid=d94e7974e4bbec11a34d72efca94204591604573518; JSESSIONID=EF65EB47872FB6EFEFADA05C8CD140EA; _ga=GA1.2.343334980.1604573522; _gid=GA1.2.1674010257.1604573522; JSESSIONID=6A642D962A119268816A05B9B3D7C8A0",
            ],
        ]);

        $response = curl_exec($curl);
        $curlError = curl_error($curl);
        curl_close($curl);

        if ($response === false) {
            throw new \RuntimeException("ins246 search request failed: " . $curlError);
        }

        $formatResponse = json_decode($response, true);
        if (!is_array($formatResponse)) {
            throw new \RuntimeException("ins246 search returned a non-JSON response");
        }

        if (!isset($formatResponse["code"]) || (int) $formatResponse["code"] !== 200) {
            $code = isset($formatResponse["code"]) ? $formatResponse["code"] : "missing";
            throw new \RuntimeException("ins246 search returned code " . $code);
        }

        if (!isset($formatResponse["data"]) || !is_array($formatResponse["data"])) {
            throw new \RuntimeException("ins246 search response has no data");
        }

        return $formatResponse["data"];
    }

    /**
     * Download one media URL into $filePrefix unless it is already there.
     *
     * The local file name is the last path segment of the URL (query string
     * removed, extension kept).
     *
     * @param string $filenameUrl Absolute URL of the media file.
     * @param int    $flag        Unused; kept so existing callers keep working.
     * @param string $filePrefix  String prepended to the file name, normally a directory ending in "/".
     *
     * @return int 0 when the file (or "<file>.back") already exists,
     *             1 when it was downloaded and written,
     *             -1 when the download or the write failed.
     */
    public function downloadFile($filenameUrl, $flag = 0, $filePrefix = "")
    {
        $filename = $this->filenameFromUrl($filenameUrl);
        if ($filename === "") {
            $this->logFailUrl($filePrefix, $filenameUrl, "no file name in url");
            return -1;
        }

        $target = $filePrefix . $filename;
        if (file_exists($target) || file_exists($target . ".back")) {
            echo "\n file exists " . $target;
            return 0;
        }

        try {
            $image = $this->fetchImage($filenameUrl);

            $directory = dirname($target);
            // The second is_dir() covers another process creating the directory in between.
            if (!is_dir($directory) && !mkdir($directory, 0775, true) && !is_dir($directory)) {
                throw new \RuntimeException("cannot create directory " . $directory);
            }

            echo "new filename: " . $target . "\n";
            if (!file_put_contents($target, $image)) {
                // Remove a partial/empty file, otherwise the next run would treat
                // it as "already downloaded" and stop scraping this user.
                if (file_exists($target)) {
                    unlink($target);
                }
                throw new \RuntimeException("cannot write " . $target);
            }
        } catch (\Exception $e) {
            $this->logFailUrl($filePrefix, $filenameUrl, $e->getMessage());
            return -1;
        }

        return 1;
    }

    /**
     * Derive the local file name from a media URL.
     *
     * @param string $url Media URL, possibly carrying a query string.
     *
     * @return string Last path segment including its extension, or "" if there is none.
     */
    private function filenameFromUrl($url)
    {
        $path = parse_url((string) $url, PHP_URL_PATH);
        if (!is_string($path) || $path === "") {
            // parse_url() gave up on the URL; fall back to cutting the query off by hand.
            $parts = explode("?", (string) $url, 2);
            $path = $parts[0];
        }

        return basename($path);
    }

    /**
     * Fetch the raw bytes behind a media URL.
     *
     * @param string $url Media URL.
     *
     * @return string Non-empty response body.
     *
     * @throws \RuntimeException On transport errors, non-2xx status codes or an empty body.
     */
    private function fetchImage($url)
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_AUTOREFERER, false);
        curl_setopt($ch, CURLOPT_REFERER, self::REFERER);
        curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 15);
        curl_setopt($ch, CURLOPT_TIMEOUT, 120);

        $image = curl_exec($ch);
        $curlError = curl_error($ch);
        $status = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        if ($image === false) {
            throw new \RuntimeException("download failed: " . $curlError);
        }
        if ($status < 200 || $status >= 300) {
            // Do not save an error page under an image file name.
            throw new \RuntimeException("download returned HTTP " . $status);
        }
        if ($image === "") {
            throw new \RuntimeException("download returned an empty body");
        }

        return $image;
    }

    /**
     * Record a media URL that could not be downloaded.
     *
     * @param string $filePrefix  Target prefix/directory the file was meant for.
     * @param string $filenameUrl URL that failed.
     * @param string $reason      Optional failure description.
     *
     * @return void
     */
    private function logFailUrl($filePrefix, $filenameUrl, $reason = "")
    {
        Log::error("ins download failed: url=" . $filenameUrl . " target=" . $filePrefix . " reason=" . $reason);
    }
}