crawByUrl($url); */
class Crawler
{
    /**
     * Pool of browser User-Agent strings; one of them is selected at random
     * when the crawler is constructed.
     *
     * @var array
     */
    protected $agent = [
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
        'Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)',
        'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)',
        'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)',
        'Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0',
        'Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20',
        'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
    ];

    /** @var string Host name; defaults to 'weixin.sogou.com' when none is given. */
    public $host = '';

    /** @var string The User-Agent string picked at random from $agent. */
    public $header = '';

    /** @var string Referer URL; defaults to 'http://weixin.sogou.com/' when none is given. */
    public $referer = '';

    /** @var string URL prefix (ending in '?url=') used to work around WeChat image hotlink protection. */
    public $antiLeech = '';

    /**
     * Set up the request settings (originally intended for cURL).
     *
     * @param string $host    host name; any empty value selects the default
     * @param string $referer referer URL; any empty value selects the default
     * @param mixed  $proxy   accepted but not used by this constructor
     */
    public function __construct($host = '', $referer = '', $proxy = false)
    {
        // Choose a random entry from the User-Agent pool.
        $lastIndex = count($this->agent) - 1;
        $this->header = $this->agent[rand(0, $lastIndex)];

        // Empty arguments fall back to the Sogou WeChat search defaults.
        $this->referer = $referer ?: 'http://weixin.sogou.com/';
        $this->host = $host ?: 'weixin.sogou.com';

        // Prefix for rewriting WeChat image links (hotlink-protection workaround).
        // NOTE(review): url() is a helper defined outside this file; presumably it
        // builds an absolute URL for the 'index/wx/url' route - confirm.
        $this->antiLeech = url('index/wx/url', [], true, true) . '?url=';
    }

    /**
     * Fetch the content found at the given URL.
     *
     * @author bignerd
     * @since 2016-08-16T10:13:58+0800
     * @param string $url address to download
     * @return string|false whatever file_get_contents() returns for the URL
     */
    public function _get($url)
    {
        // A cURL-based implementation was sketched here earlier and left disabled;
        // the page is read with file_get_contents() instead.
        // NOTE(review): the User-Agent and referer chosen in the constructor are
        // not applied to this request.
        return file_get_contents($url);
    }

    /**
     * Download an article and return its basic info merged with its content parts.
     *
     * @param string $url article address
     * @return array result of articleBasicInfo() plus the keys 'content_html',
     *               'content_text' and 'content_css'
     */
    public function crawByUrl($url)
    {
        $source = $this->_get($url);
        $article = $this->articleBasicInfo($source);

        list($html, $text, $css) = $this->contentHandle($source);
        $contentParts = [
            'content_html' => $html,
            'content_text' => $text,
            'content_css' => $css,
        ];

        return array_merge($article, $contentParts);
    }

    /**
     * Process the WeChat article source: extract the article body and handle image links.
     *
     * @author bignerd
     * @since 2016-08-16T15:59:27+0800
     * @param $content the fetched WeChat article source
     * @return array [HTML text with images, HTML text without images]; note that
     *               crawByUrl() unpacks three elements (html, text, css)
     */
    public function contentHandle($content)
    {
        $content_html_pattern = '/