scott
/
crm


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208
							<?php

namespace wx\offiaccount;

use OSS\OssClient;

/**
 * Crawler for WeChat Official Account articles.
 *
 * Usage:
 *   $crawler = new Crawler();
 *   $article = $crawler->crawByUrl($url);
 *
 * Depends on the framework helpers url(), config() and trace() and on the
 * Aliyun OSS SDK client (OssClient) being available at runtime.
 */
class Crawler
{
    /** @var string[] Pool of User-Agent strings; one is picked at random per instance. */
    protected $agent = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    ];

    /** @var string Host name associated with the crawl. */
    public $host = '';

    /** @var string User-Agent string selected for this instance. */
    public $header = '';

    /** @var string Referer associated with the crawl. */
    public $referer = '';

    /** @var string URL prefix of the local endpoint used to work around WeChat image hotlink protection. */
    public $antiLeech = '';

    /**
     * Initialises the request metadata.
     *
     * @param string $host    Host name; falls back to 'weixin.sogou.com' when empty.
     * @param string $referer Referer; falls back to 'http://weixin.sogou.com/' when empty.
     * @param bool   $proxy   Not used by this class; kept so existing callers keep working.
     */
    public function __construct($host = '', $referer = '', $proxy = false)
    {
        $this->header  = $this->agent[array_rand($this->agent)];
        $this->referer = empty($referer) ? 'http://weixin.sogou.com/' : $referer;
        $this->host    = empty($host) ? 'weixin.sogou.com' : $host;
        // Prefix that routes WeChat images through the local anti-hotlink endpoint.
        $this->antiLeech = url('index/wx/url', [], true, true) . '?url=';
    }

    /**
     * Downloads the raw source of a page.
     *
     * The request is a plain file_get_contents() call; the User-Agent and
     * Referer stored on the instance are not sent with it.
     *
     * @author bignerd
     * @since  2016-08-16T10:13:58+0800
     * @param  string $url
     * @return string|false Page source, or false when the download fails.
     */
    public function _get($url)
    {
        return file_get_contents($url);
    }

    /**
     * Crawls one article and returns its metadata together with its content.
     *
     * A failed download is treated as an empty page, so the result then holds
     * empty strings instead of triggering warnings in the parsers.
     *
     * @param  string $url Article URL.
     * @return array  Result of articleBasicInfo() plus the keys
     *                'content_html', 'content_text' and 'content_css'.
     */
    public function crawByUrl($url)
    {
        $content = $this->_get($url);
        if (!is_string($content)) {
            $content = '';
        }

        $basicInfo = $this->articleBasicInfo($content);
        list($content_html, $content_text, $content_css) = $this->contentHandle($content);

        return array_merge($basicInfo, [
            'content_html' => $content_html,
            'content_text' => $content_text,
            'content_css'  => $content_css,
        ]);
    }

    /**
     * Extracts the article body from the page source and mirrors its media to OSS.
     *
     * Every iframe (video) and every remaining data-src (image) is downloaded,
     * uploaded to the configured OSS bucket and rewritten to a src attribute
     * pointing at the uploaded copy. When the page has no article body, the
     * OSS client is never created and the html/text entries are empty strings.
     *
     * @author bignerd
     * @since  2016-08-16T15:59:27+0800
     * @param  string $content Source of the crawled article page.
     * @return array  [html with images, html without images, first <style> block]
     */
    public function contentHandle($content)
    {
        $content = (string) $content;

        // First <style> block of the page ('' when there is none).
        $content_css = '';
        if (preg_match('/<style[^>]*>(.*?)<\/style>/s', $content, $cssMatch)) {
            $content_css = $cssMatch[0];
        }

        // NOTE(review): the lazy match stops at the first closing </div>, so a
        // body containing nested <div> elements would be cut short — confirm.
        $content_html_pattern = '/<div class="rich_media_content " id="js_content"[^>]*>(.*?)<\/div>/s';
        if (!preg_match($content_html_pattern, $content, $bodyMatch)) {
            return ['', '', $content_css];
        }

        // Reveal sections that the page hides until its scripts have run.
        $content_html = str_replace('visibility: hidden;', 'visibility: visible;', $bodyMatch[0]);

        $accessKeyId = config('app.ali_oss_access_key_id');
        $accessKeySecret = config('app.ali_oss_access_key_secret');
        $endpoint = config('app.ali_oss_end_point');
        $bucket = config('app.ali_oss_bucket');
        $bucketUrl = config('app.ali_oss_bindurl');
        $oss = new OssClient($accessKeyId, $accessKeySecret, $endpoint);

        // Videos: the attribute group is optional so that data-src may also be
        // the first attribute of the iframe.
        $content_html = preg_replace_callback(
            '/<iframe((?:\s[^>]*?)?)\s+data-src="(.*?)"/',
            function ($matches) use ($oss, $bucket, $bucketUrl) {
                $storedUrl = $this->mirrorVideo($matches[2], $oss, $bucket, $bucketUrl);
                if ($storedUrl === null) {
                    // Keep the other attributes but drop data-src, so the image
                    // pass below does not try to download the video as an image.
                    return '<iframe' . $matches[1];
                }
                return '<iframe' . $matches[1] . ' src="' . $storedUrl . '"';
            },
            $content_html
        );

        // Images: every data-src that is still present.
        $content_html = preg_replace_callback(
            '/data-src="(.*?)"/',
            function ($matches) use ($oss, $bucket, $bucketUrl) {
                $storedUrl = $this->mirrorImage($matches[1], $oss, $bucket, $bucketUrl);
                // On failure the attribute is left untouched rather than
                // pointing at an empty uploaded object.
                return $storedUrl === null ? $matches[0] : 'src="' . $storedUrl . '"';
            },
            $content_html
        );

        // Same html with all <img> tags removed.
        $content_text = preg_replace('/<img.*?>/s', '', $content_html);

        return [$content_html, $content_text, $content_css];
    }

    /**
     * Extracts the basic article fields from the page source.
     *
     * Each field is read from a script variable assignment of the form
     * ` var <name> = "<value>";` and passed through htmlTransform(); fields
     * that are not found are returned as ''. The 'author' and 'copyright_stat'
     * keys are only present when their markup is found.
     *
     * @author bignerd
     * @since  2016-08-16T17:16:32+0800
     * @param  string $content Source of the article page.
     * @return array  $basicInfo
     */
    public function articleBasicInfo($content)
    {
        // Script variable name => key in the result.
        $item = [
            'ct' => 'date', // publish time
            'msg_title' => 'title', // title
            'msg_desc' => 'digest', // description
            'msg_link' => 'content_url', // article link
            'cdn_url_1_1' => 'cover', // cover image link
            'nickname' => 'wechatname', // official account name
        ];

        $basicInfo = [];
        foreach ($item as $k => $v) {
            $pattern = '/ var\s*' . $k . '\s*=\s*[\'|"](.*?)[\'|"][\.|;]/s';
            $found = preg_match($pattern, $content, $matches) && !empty($matches[1]);
            $basicInfo[$v] = $found ? $this->htmlTransform($matches[1]) : '';
        }

        // Author
        preg_match('/<em class="rich_media_meta rich_media_meta_text">(.*?)<\/em>/s', $content, $matchAuthor);
        if (!empty($matchAuthor[1])) {
            $basicInfo['author'] = $matchAuthor[1];
        }

        // Article type (text of the copyright tag)
        preg_match('/<span id="copyright_logo" class="rich_media_meta meta_original_tag">(.*?)<\/span>/s', $content, $matchType);
        if (!empty($matchType[1])) {
            $basicInfo['copyright_stat'] = $matchType[1];
        }

        return $basicInfo;
    }

    /**
     * Decodes a fixed set of HTML entities and strips backslashes.
     *
     * The replacements run in the listed order, each on the result of the
     * previous one; the bare 'amp;' step removes what is left of a
     * double-encoded '&amp;amp;'.
     *
     * @author bignerd
     * @since  2016-08-16T17:30:52+0800
     * @param  string $string
     * @return string
     */
    public function htmlTransform($string)
    {
        return str_replace(
            ['&quot;', '&amp;', 'amp;', '&lt;', '&gt;', '&nbsp;', "\\"],
            ['"', '&', '', '<', '>', ' ', ''],
            $string
        );
    }

    /**
     * Downloads the video behind an iframe data-src and uploads it to OSS.
     *
     * @param  string    $dataSrc   Raw data-src attribute value.
     * @param  OssClient $oss       OSS client.
     * @param  string    $bucket    Target bucket.
     * @param  string    $bucketUrl Public domain bound to the bucket.
     * @return string|null URL of the uploaded copy, or null on failure.
     */
    private function mirrorVideo($dataSrc, $oss, $bucket, $bucketUrl)
    {
        // Attribute values are HTML-encoded ('&amp;' between query parameters).
        $videourl = htmlspecialchars_decode($dataSrc);
        $urlParam = $this->queryParams($videourl);

        if (isset($urlParam['vid']) && is_string($urlParam['vid'])) {
            // The data-src is a player page; ask for the real media URL.
            $videourl = $this->resolveVideoUrl($urlParam['vid']);
        }

        $ext = '';
        if ($videourl !== null) {
            if (isset($urlParam['wx_fmt']) && is_string($urlParam['wx_fmt'])) {
                $ext = $urlParam['wx_fmt'];
            } else {
                $ext = pathinfo((string) parse_url($videourl, PHP_URL_PATH), PATHINFO_EXTENSION);
            }
            $ext = $this->sanitizeExtension($ext);
        }

        if ($videourl === null || $ext === '') {
            trace('视频抓取失败：' . ($videourl === null ? $dataSrc : $videourl), 'error');
            return null;
        }

        $binary = file_get_contents($videourl);
        if ($binary === false) {
            trace('视频抓取失败：' . $videourl, 'error');
            return null;
        }

        return $this->storeObject($oss, $bucket, $bucketUrl, $binary, $ext);
    }

    /**
     * Downloads the image behind a data-src and uploads it to OSS.
     *
     * @param  string    $dataSrc   Raw data-src attribute value.
     * @param  OssClient $oss       OSS client.
     * @param  string    $bucket    Target bucket.
     * @param  string    $bucketUrl Public domain bound to the bucket.
     * @return string|null URL of the uploaded copy, or null on failure.
     */
    private function mirrorImage($dataSrc, $oss, $bucket, $bucketUrl)
    {
        $imageUrl = htmlspecialchars_decode($dataSrc);

        // Take the extension from the URL path only, so that a dot inside the
        // query string is not mistaken for one.
        $ext = pathinfo((string) parse_url($imageUrl, PHP_URL_PATH), PATHINFO_EXTENSION);
        if ($ext === '') {
            $urlParam = $this->queryParams($imageUrl);
            if (isset($urlParam['wx_fmt']) && is_string($urlParam['wx_fmt'])) {
                $ext = $urlParam['wx_fmt'];
            }
        }
        $ext = $this->sanitizeExtension($ext);

        $binary = file_get_contents($imageUrl);
        if ($binary === false) {
            trace('图片抓取失败：' . $imageUrl, 'error');
            return null;
        }

        return $this->storeObject($oss, $bucket, $bucketUrl, $binary, $ext);
    }

    /**
     * Asks the WeChat video player endpoint for the media URL of a video id.
     *
     * @param  string $vid Video id taken from the iframe URL.
     * @return string|null First media URL in the response, or null when the
     *                     request fails or the response has no such URL.
     */
    private function resolveVideoUrl($vid)
    {
        $json = file_get_contents(
            'https://mp.weixin.qq.com/mp/videoplayer?action=get_mp_video_play_url&vid=' . rawurlencode($vid)
        );
        if ($json === false) {
            return null;
        }

        $data = json_decode($json, true);
        if (isset($data['url_info'][0]['url'])
            && is_string($data['url_info'][0]['url'])
            && $data['url_info'][0]['url'] !== ''
        ) {
            return $data['url_info'][0]['url'];
        }

        return null;
    }

    /**
     * Uploads a payload to OSS under a unique key and returns its public URL.
     *
     * Keys always use '/' (never DIRECTORY_SEPARATOR) because they end up in
     * the URL, and the file name is a hash of a unique id so that several
     * uploads within the same second cannot overwrite each other.
     *
     * @param  OssClient $oss       OSS client.
     * @param  string    $bucket    Target bucket.
     * @param  string    $bucketUrl Public domain bound to the bucket.
     * @param  string    $body      Payload to store.
     * @param  string    $ext       File extension without dot; may be ''.
     * @return string    Public https URL of the stored object.
     */
    private function storeObject($oss, $bucket, $bucketUrl, $body, $ext)
    {
        $key = 'wxcontent/' . date('Ymd') . '/' . md5(uniqid((string) mt_rand(), true));
        if ($ext !== '') {
            $key .= '.' . $ext;
        }

        $oss->putObject($bucket, $key, $body);

        return 'https://' . $bucketUrl . '/' . $key;
    }

    /**
     * Returns the query parameters of a URL, or [] when it has no query.
     *
     * @param  string $url
     * @return array
     */
    private function queryParams($url)
    {
        $params = [];
        $query = parse_url($url, PHP_URL_QUERY);
        if (is_string($query) && $query !== '') {
            parse_str($query, $params);
        }

        return $params;
    }

    /**
     * Reduces an extension taken from remote content to letters and digits,
     * since it becomes part of the object key.
     *
     * @param  mixed  $ext
     * @return string
     */
    private function sanitizeExtension($ext)
    {
        return (string) preg_replace('/[^A-Za-z0-9]/', '', (string) $ext);
    }
}