123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208 |
- <?php
- namespace wx\offiaccount;
- use OSS\OssClient;
/**
 * Crawler for WeChat Official Account articles.
 *
 * Fetches an article page, extracts its metadata and body, and re-hosts the
 * media referenced through `data-src` attributes on Aliyun OSS.
 *
 * Usage:
 *   $crawler = new Crawler();
 *   $article = $crawler->crawByUrl($url);
 */
class Crawler
{
    /** @var string[] Pool of User-Agent strings; the constructor picks one at random. */
    protected $agent = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    ];

    /** @var string Host name given to the constructor (defaults to weixin.sogou.com). Not applied by _get(). */
    public $host = '';

    /** @var string User-Agent string chosen for this instance. Not applied by _get(). */
    public $header = '';

    /** @var string Referer given to the constructor (defaults to http://weixin.sogou.com/). Not applied by _get(). */
    public $referer = '';

    /**
     * @var string URL of the `index/wx/url` route followed by "?url=". Per the original
     *             author's note it is meant for working around WeChat image hotlink
     *             protection; nothing in this class reads it.
     */
    public $antiLeech = '';

    /**
     * @param string $host    Host name to remember; empty selects 'weixin.sogou.com'.
     * @param string $referer Referer to remember; empty selects 'http://weixin.sogou.com/'.
     * @param bool   $proxy   Accepted for backward compatibility; currently unused.
     */
    public function __construct($host = '', $referer = '', $proxy = false)
    {
        $this->header = $this->agent[array_rand($this->agent)];
        $this->referer = empty($referer) ? 'http://weixin.sogou.com/' : $referer;
        $this->host = empty($host) ? 'weixin.sogou.com' : $host;
        $this->antiLeech = url('index/wx/url', [], true, true) . '?url=';
    }

    /**
     * Download the raw content behind a URL.
     *
     * @param string $url
     * @return string|false Response body, or false when the download fails.
     */
    public function _get($url)
    {
        return file_get_contents($url);
    }

    /**
     * Fetch an article and return its metadata together with the processed body.
     *
     * @param string $url Article URL.
     * @return array The keys produced by articleBasicInfo() plus 'content_html',
     *               'content_text' and 'content_css'. The content_* entries are empty
     *               strings when the page could not be fetched or holds no article body.
     */
    public function crawByUrl($url)
    {
        $content = $this->_get($url);
        $basicInfo = $this->articleBasicInfo($content);
        list($content_html, $content_text, $content_css) = $this->contentHandle($content);

        return array_merge($basicInfo, [
            'content_html' => $content_html,
            'content_text' => $content_text,
            'content_css'  => $content_css,
        ]);
    }

    /**
     * Extract the article body from the page source and re-host its media on OSS.
     *
     * Every `data-src` attribute inside the body is rewritten to a `src` attribute that
     * points at the uploaded copy. A media item that cannot be downloaded is logged and
     * skipped instead of being uploaded as an empty object.
     *
     * @param string $content Raw page source of the article.
     * @return array [body html, body html without <img> tags, first <style> block];
     *               missing parts are returned as empty strings.
     */
    public function contentHandle($content)
    {
        $content = (string) $content;

        $content_css = '';
        if (preg_match('/<style[^>]*>(.*?)<\/style>/s', $content, $cssMatch)) {
            $content_css = $cssMatch[0];
        }

        // NOTE: the lazy match stops at the first closing </div>, so a body that itself
        // contains <div> elements is cut short there.
        $bodyPattern = '/<div class="rich_media_content " id="js_content"[^>]*>(.*?)<\/div>/s';
        if (!preg_match($bodyPattern, $content, $bodyMatch)) {
            // No article body on this page: nothing to rewrite or upload.
            return ['', '', $content_css];
        }

        // Reveal sections that the page ships hidden.
        $content_html = str_replace('visibility: hidden;', 'visibility: visible;', $bodyMatch[0]);

        $oss = new OssClient(
            config('app.ali_oss_access_key_id'),
            config('app.ali_oss_access_key_secret'),
            config('app.ali_oss_end_point')
        );
        $bucket = config('app.ali_oss_bucket');
        $bucketUrl = config('app.ali_oss_bindurl');

        // Videos first: once rewritten they no longer carry data-src, so the generic
        // pass below leaves them alone.
        $content_html = preg_replace_callback(
            '/<iframe ([^>]*) data-src="(.*?)"/',
            function ($matches) use ($oss, $bucket, $bucketUrl) {
                $hosted = $this->rehostVideo($oss, $bucket, $bucketUrl, $matches[2]);
                if ($hosted === null) {
                    return '<iframe ';
                }

                return '<iframe ' . $matches[1] . ' src="' . $hosted . '"';
            },
            $content_html
        );

        // Every remaining data-src (images) is downloaded and re-hosted.
        $content_html = preg_replace_callback(
            '/data-src="(.*?)"/',
            function ($matches) use ($oss, $bucket, $bucketUrl) {
                $hosted = $this->rehostImage($oss, $bucket, $bucketUrl, $matches[1]);

                // Keep the original attribute when the download failed.
                return $hosted === null ? $matches[0] : 'src="' . $hosted . '"';
            },
            $content_html
        );

        $content_text = preg_replace('/<img.*?>/s', '', $content_html);

        return [$content_html, $content_text, $content_css];
    }

    /**
     * Extract the basic article fields from the page source.
     *
     * @param string $content Raw page source of the article.
     * @return array Always holds 'date', 'title', 'digest', 'content_url', 'cover' and
     *               'wechatname' ('' when not found); 'author' and 'copyright_stat' are
     *               present only when the corresponding markup is found.
     */
    public function articleBasicInfo($content)
    {
        $content = (string) $content;

        // JavaScript variable name in the page => key in the result.
        $item = [
            'ct'          => 'date',        // publish time
            'msg_title'   => 'title',       // title
            'msg_desc'    => 'digest',      // description
            'msg_link'    => 'content_url', // article link
            'cdn_url_1_1' => 'cover',       // cover image link
            'nickname'    => 'wechatname',  // account name
        ];

        $basicInfo = [];
        foreach ($item as $k => $v) {
            $pattern = '/ var\s*' . $k . '\s*=\s*[\'|"](.*?)[\'|"][\.|;]/s';
            $found = preg_match($pattern, $content, $matches) && !empty($matches[1]);
            $basicInfo[$v] = $found ? $this->htmlTransform($matches[1]) : '';
        }

        // Author
        preg_match('/<em class="rich_media_meta rich_media_meta_text">(.*?)<\/em>/s', $content, $matchAuthor);
        if (!empty($matchAuthor[1])) {
            $basicInfo['author'] = $matchAuthor[1];
        }

        // Article type (original-content tag)
        preg_match('/<span id="copyright_logo" class="rich_media_meta meta_original_tag">(.*?)<\/span>/s', $content, $matchType);
        if (!empty($matchType[1])) {
            $basicInfo['copyright_stat'] = $matchType[1];
        }

        return $basicInfo;
    }

    /**
     * Decode the HTML entities found in the page's JavaScript string values and strip
     * backslashes.
     *
     * @param string $string
     * @return string
     */
    public function htmlTransform($string)
    {
        // str_replace applies the pairs in order over the whole string, so the 'amp;'
        // step cleans up what double-encoded input ("&amp;amp;") leaves behind after
        // '&amp;' has been decoded.
        return str_replace(
            ['&quot;', '&amp;', 'amp;', '&lt;', '&gt;', '&nbsp;', "\xC2\xA0", "\\"],
            ['"',      '&',     '',     '<',    '>',    ' ',      ' ',        ''],
            $string
        );
    }

    /**
     * Download a video referenced by an iframe and upload it to OSS.
     *
     * @param object $oss       OSS client.
     * @param string $bucket    Target bucket.
     * @param string $bucketUrl Public host name bound to the bucket.
     * @param string $rawUrl    Value of the iframe's data-src attribute (HTML-escaped).
     * @return string|null Public URL of the uploaded copy, or null on failure.
     */
    private function rehostVideo($oss, $bucket, $bucketUrl, $rawUrl)
    {
        $videoUrl = htmlspecialchars_decode($rawUrl);
        $params = $this->queryParams($videoUrl);

        if (isset($params['vid']) && is_string($params['vid'])) {
            // The iframe only carries a video id; ask the player endpoint for the file URL.
            $json = file_get_contents(
                'https://mp.weixin.qq.com/mp/videoplayer?action=get_mp_video_play_url&vid=' . rawurlencode($params['vid'])
            );
            $data = $json === false ? null : json_decode($json, true);
            $videoUrl = isset($data['url_info'][0]['url']) ? (string) $data['url_info'][0]['url'] : '';
        }

        $ext = isset($params['wx_fmt'])
            ? $this->cleanExtension($params['wx_fmt'])
            : $this->pathExtension($videoUrl);

        $body = ($videoUrl === '' || $ext === '') ? false : file_get_contents($videoUrl);
        if ($body === false) {
            trace('视频抓取失败:' . $videoUrl, 'error');
            return null;
        }

        return $this->storeOnOss($oss, $bucket, $bucketUrl, $body, $ext);
    }

    /**
     * Download an image and upload it to OSS.
     *
     * @param object $oss       OSS client.
     * @param string $bucket    Target bucket.
     * @param string $bucketUrl Public host name bound to the bucket.
     * @param string $rawUrl    Value of the data-src attribute (HTML-escaped).
     * @return string|null Public URL of the uploaded copy, or null on failure.
     */
    private function rehostImage($oss, $bucket, $bucketUrl, $rawUrl)
    {
        $imageUrl = htmlspecialchars_decode($rawUrl);

        $ext = $this->pathExtension($imageUrl);
        if ($ext === '') {
            $params = $this->queryParams($imageUrl);
            if (isset($params['wx_fmt'])) {
                $ext = $this->cleanExtension($params['wx_fmt']);
            }
        }

        $body = file_get_contents($imageUrl);
        if ($body === false) {
            trace('图片抓取失败:' . $imageUrl, 'error');
            return null;
        }

        return $this->storeOnOss($oss, $bucket, $bucketUrl, $body, $ext);
    }

    /**
     * Upload a blob under a dated, generated key and return its public URL.
     *
     * @param object $oss       OSS client.
     * @param string $bucket    Target bucket.
     * @param string $bucketUrl Public host name bound to the bucket.
     * @param string $body      Content to upload.
     * @param string $ext       File extension without the dot (may be empty).
     * @return string
     */
    private function storeOnOss($oss, $bucket, $bucketUrl, $body, $ext)
    {
        // Object keys and URLs always use "/", whatever the local directory separator is.
        $path = 'wxcontent/' . date('Ymd') . '/' . time() . rand(1000, 9999) . '.' . $ext;
        $oss->putObject($bucket, $path, $body);

        return 'https://' . $bucketUrl . '/' . $path;
    }

    /**
     * Parse the query string of a URL into an array ([] when there is none).
     *
     * @param string $url
     * @return array
     */
    private function queryParams($url)
    {
        $params = [];
        parse_str((string) parse_url($url, PHP_URL_QUERY), $params);

        return $params;
    }

    /**
     * File extension of a URL's path component, ignoring query string and fragment.
     *
     * @param string $url
     * @return string
     */
    private function pathExtension($url)
    {
        $ext = pathinfo((string) parse_url($url, PHP_URL_PATH), PATHINFO_EXTENSION);

        return $this->cleanExtension($ext);
    }

    /**
     * Reduce an extension taken from remote data to plain alphanumerics, since it ends
     * up in an object key and inside an HTML attribute.
     *
     * @param mixed $ext
     * @return string
     */
    private function cleanExtension($ext)
    {
        return is_string($ext) ? preg_replace('/[^A-Za-z0-9]/', '', $ext) : '';
    }
}
|