files = array_merge($this->files, scanFilesWithoutPath($this->imageDir));
        $this->files = array_merge($this->files, scanFilesWithoutPath($this->videoDir));
    }

    /**
     * Download the pictures and videos referenced by a Weibo timeline response.
     *
     * Statuses are read from `data.cards[*].mblog` of the decoded JSON; cards
     * without an `mblog` entry are skipped.
     *
     * @param string $content Raw JSON response body.
     * @return bool Always true; exceptions are logged instead of propagated.
     */
    public function scrapeWeiboPicAndVideo($content)
    {
        $video_dir = "/Volumes/Crucial X6/Image/weibo/video/";
        $image_dir = "/Volumes/Crucial X6/Image/weibo/image/";

        // Preferred lookup first (same keys and order as before), then a
        // fallback to `media_info` for statuses that carry no `urls` map.
        // NOTE(review): "mp4_hd_url" under `urls` may have been meant to be
        // "mp4_hd_mp4" - confirm against a real API response.
        $videoSources = [
            ["urls", ["mp4_720p_mp4", "mp4_hd_url", "mp4_ld_mp4"]],
            ["media_info", ["mp4_720p_mp4", "mp4_hd_url", "mp4_sd_url"]],
        ];

        return $this->scrapeStatusList($content, "data.cards", true, $image_dir, $video_dir, $videoSources);
    }

    /**
     * Download the pictures and videos referenced by a Weibo group-timeline response.
     *
     * Statuses are read directly from `data.statuses[*]` of the decoded JSON.
     *
     * @param string $content Raw JSON response body.
     * @return bool Always true; exceptions are logged instead of propagated.
     */
    public function scrapeGroupWeiboPicAndVideo($content)
    {
        $video_dir = "/Volumes/Crucial X6/Image/weibo/video/";
        $image_dir = "/Volumes/Crucial X6/Image/weibo/image/";

        // Preferred lookup first (same keys and order as before), then a
        // fallback to the `urls` map when `media_info` yields nothing.
        $videoSources = [
            ["media_info", ["mp4_720p_mp4", "mp4_hd_url", "mp4_sd_url"]],
            ["urls", ["mp4_720p_mp4", "mp4_hd_url", "mp4_ld_mp4"]],
        ];

        return $this->scrapeStatusList($content, "data.statuses", false, $image_dir, $video_dir, $videoSources);
    }

    /**
     * Shared driver for both scrape entry points: decode, walk the status list, download media.
     *
     * @param string $content      Raw JSON response body.
     * @param string $listPath     Dot-notation path of the status list inside the decoded JSON.
     * @param bool   $unwrapMblog  True when each list entry wraps the status in an `mblog` key.
     * @param string $imageDir     Directory that receives pictures.
     * @param string $videoDir     Directory that receives videos.
     * @param array  $videoSources Ordered list of [page_info section, [url keys]] pairs.
     * @return bool Always true.
     */
    private function scrapeStatusList($content, $listPath, $unwrapMblog, $imageDir, $videoDir, array $videoSources)
    {
        try {
            if (strlen($content) > 0) {
                $decoded = json_decode($content, true);
                if (!is_array($decoded)) {
                    Log::error("weibo response is not a JSON object: " . json_last_error_msg());
                    return true;
                }

                $entries = Arr::get($decoded, $listPath, []);
                if (!is_array($entries)) {
                    Log::error("weibo response has no status list at '" . $listPath . "'");
                    return true;
                }

                foreach ($entries as $entry) {
                    if (!is_array($entry)) {
                        continue;
                    }
                    if ($unwrapMblog) {
                        if (!array_key_exists("mblog", $entry) || !is_array($entry["mblog"])) {
                            continue;
                        }
                        $entry = $entry["mblog"];
                    }
                    $this->processWeiboStatus($entry, $imageDir, $videoDir, $videoSources);
                }
            } else {
                // NOTE(review): terminates the whole script on empty input; kept
                // because callers may rely on the 'empty' response body.
                die('empty');
            }
        } catch (\Exception $e) {
            Log::error($e);
            Log::error($e->getTraceAsString());
        }

        return true;
    }

    /**
     * Handle one status: its own pictures, else a retweet's pictures, else its video.
     *
     * The three cases are mutually exclusive and tried in that order, so a
     * retweet without pictures is never examined for a video.
     *
     * @param array  $mblog        Decoded status.
     * @param string $imageDir     Directory that receives pictures.
     * @param string $videoDir     Directory that receives videos.
     * @param array  $videoSources Ordered list of [page_info section, [url keys]] pairs.
     * @return void
     */
    private function processWeiboStatus(array $mblog, $imageDir, $videoDir, array $videoSources)
    {
        // NOTE(review): this echo and the var_dump below look like debug output;
        // kept because a client may read the response body - confirm and remove.
        echo json_encode($mblog);

        if (array_key_exists("pics", $mblog)) {
            $pics = $mblog['pics'];
            var_dump($pics);
            $this->process_pic($pics, $imageDir, Arr::get($mblog, "user", []), Arr::get($mblog, "text", ''));
            return;
        }

        if (array_key_exists("retweeted_status", $mblog)) {
            $retweeted = $mblog["retweeted_status"];
            if (is_array($retweeted) && array_key_exists("pics", $retweeted)) {
                $this->process_pic(
                    $retweeted['pics'],
                    $imageDir,
                    Arr::get($retweeted, "user", []),
                    Arr::get($retweeted, "text", '')
                );
            }
            return;
        }

        if (!array_key_exists("page_info", $mblog) || !is_array($mblog["page_info"])) {
            return;
        }
        $pageInfo = $mblog["page_info"];
        if (!array_key_exists("media_info", $pageInfo)) {
            return;
        }

        $videoUrl = $this->pickVideoUrl($pageInfo, $videoSources);
        if ($videoUrl !== "") {
            $this->process_video($videoUrl, $videoDir, Arr::get($pageInfo, "content2", ''));
        }
    }

    /**
     * Return the first non-empty video URL found in the given page_info sections.
     *
     * @param array $pageInfo     The status' `page_info` array.
     * @param array $videoSources Ordered list of [page_info section, [url keys]] pairs.
     * @return string The URL, or "" when none of the keys holds one.
     */
    private function pickVideoUrl(array $pageInfo, array $videoSources)
    {
        foreach ($videoSources as $source) {
            list($section, $keys) = $source;
            $urls = Arr::get($pageInfo, $section, []);
            foreach ($keys as $key) {
                $candidate = Arr::get($urls, $key, "");
                if (is_string($candidate) && $candidate !== "") {
                    return $candidate;
                }
            }
        }

        return "";
    }

    /**
     * Current Unix timestamp with microsecond precision.
     *
     * @return float
     */
    public function microtime_float()
    {
        return microtime(true);
    }

    /**
     * Download every picture of a status into $subDir.
     *
     * Uses the `large` variant of a picture when present. Pictures whose
     * height/width ratio exceeds 15 are skipped. Files are named
     * "<screen_name>--<url file name>.<ext>" (no prefix when the user has no
     * screen name) and are not fetched again when already on disk, already
     * tracked in $this->files, or already recorded in the database.
     *
     * @param array  $pics   Picture entries of the status.
     * @param string $subDir Target directory; created when missing.
     * @param array  $user   Status author; only `screen_name` is read.
     * @param string $text   Status text (currently unused).
     * @return void
     */
    public function process_pic($pics, $subDir, $user, $text)
    {
        if (!is_array($pics) || !$this->ensureDownloadDir($subDir)) {
            return;
        }

        $user_name = Arr::get($user, "screen_name", '');
        $hasUserName = is_scalar($user_name) && (string) $user_name !== '';

        foreach ($pics as $pic) {
            if (!is_array($pic)) {
                continue;
            }
            $variant = (array_key_exists("large", $pic) && is_array($pic["large"])) ? $pic["large"] : $pic;

            $pic_url = Arr::get($variant, "url", '');
            if (!is_string($pic_url) || $pic_url === '') {
                continue;
            }

            // The ratio is computed per picture: a picture without usable
            // geometry must not inherit the ratio of the previous one.
            $h2w = 0;
            $geo = Arr::get($variant, "geo");
            if (is_array($geo)) {
                $width = (float) Arr::get($geo, "width", 0);
                $height = (float) Arr::get($geo, "height", 0);
                if ($width > 0) {
                    $h2w = $height / $width;
                }
            }
            if ($h2w > 15) {
                continue;
            }

            $picName = pathinfo($pic_url, PATHINFO_FILENAME);
            $picExt = pathinfo($pic_url, PATHINFO_EXTENSION);
            if ($hasUserName) {
                $picName = $user_name . '--' . $picName;
            }

            $baseName = $picName . "." . $picExt;
            $file_name = $subDir . DIRECTORY_SEPARATOR . $baseName;

            $this->downloadIfMissing($pic_url, $file_name, $baseName);
        }
    }

    /**
     * Download one video into $subdir.
     *
     * The file is named "<title without URLs>--<URL path>", where the URL path
     * has its leading slash dropped, "stream/" replaced by "--" and remaining
     * slashes removed. When the URL carries an `Expires` query parameter that
     * lies in the past, the error is logged and the request is aborted with a
     * 404. A URL without `Expires` is attempted as-is.
     *
     * @param string $video_url  Direct video URL.
     * @param string $subdir     Target directory; created when missing.
     * @param string $video_name Title used as file-name prefix.
     * @return void
     */
    public function process_video($video_url, $subdir, $video_name)
    {
        Log::info("video_url: " . $video_url);

        if (!$this->ensureDownloadDir($subdir)) {
            return;
        }

        $url_params = parse_url($video_url);
        if (!is_array($url_params) || !isset($url_params["path"])) {
            Log::error("video url has no path: " . $video_url);
            return;
        }

        $params = [];
        if (isset($url_params["query"])) {
            parse_str($url_params["query"], $params);
        }

        $video_origin_name = substr($url_params["path"], 1);
        $video_origin_name = str_replace("stream/", "--", $video_origin_name);
        $video_origin_name = str_replace("/", "", $video_origin_name);

        // Strip embedded links from the title, then neutralise path separators
        // so the title cannot point the target file into another directory.
        $video_name = (string) preg_replace("/(http|https|ftp)(.)*([a-z0-9\-\.\_])+/i", "", (string) $video_name);
        $video_name = str_replace("/", "_", $video_name);

        // time() instead of strtotime(date("y-m-d h:i:s")): the 12-hour "h"
        // without am/pm made "now" 12 hours too early during the afternoon.
        $now = time();
        if (isset($params['Expires']) && is_numeric($params['Expires']) && $now > (int) $params['Expires']) {
            Log::error("视频有效期已过,now is " . $now .", Expires is ". $params['Expires']);
            abort(404);
        }

        $baseName = $video_name . "--" . $video_origin_name;
        $file_name = $subdir . "/" . $baseName;

        $this->downloadIfMissing($video_url, $file_name, $baseName);
    }

    /**
     * Make sure a download directory exists, creating missing parents as needed.
     *
     * @param string $dir Directory path.
     * @return bool True when the path exists afterwards.
     */
    private function ensureDownloadDir($dir)
    {
        if (file_exists($dir)) {
            return true;
        }
        // The trailing is_dir() covers a concurrent request creating it first.
        if (mkdir($dir, 0777, true) || is_dir($dir)) {
            return true;
        }

        Log::error("unable to create directory: " . $dir);
        return false;
    }

    /**
     * Fetch $url into $filePath unless the file is already known.
     *
     * A failed read or write is logged and the file is not recorded in
     * $this->files, so a later run can retry it.
     *
     * @param string $url      Remote location.
     * @param string $filePath Full target path.
     * @param string $baseName File name used for the duplicate checks.
     * @return void
     */
    private function downloadIfMissing($url, $filePath, $baseName)
    {
        if (file_exists($filePath) || $this->checkFileHasDownload($baseName)) {
            Log::info("$baseName file exists");
            return;
        }

        $data = file_get_contents($url);
        if ($data === false) {
            Log::error("download failed: " . $url);
            return;
        }
        if (file_put_contents($filePath, $data) === false) {
            Log::error("write failed: " . $filePath);
            return;
        }

        $this->files[] = $baseName;
    }

    /**
     * Tell whether a file name is already tracked locally or recorded in the database.
     *
     * @param string $fileName Base name of the file.
     * @return bool
     */
    private function checkFileHasDownload($fileName)
    {
        if (in_array($fileName, $this->files, true)) {
            Log::info("$fileName exist in local files");
            return true;
        }

        $record = ImageRecord::where("name", $fileName)->first();
        if ($record !== null) {
            Log::info("$fileName exist in db");
            return true;
        }

        return false;
    }
}