Crawler.php 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208
  1. <?php
  2. namespace wx\offiaccount;
  3. use OSS\OssClient;
  4. /**
  5. * 微信公众号文章爬取类
  6. * 使用方法:
  7. * $crawler = new WxCrawler();
  8. * $content = $crawler->crawByUrl($url);
  9. */
  10. class Crawler
  11. {
  12. /** @var 代理 */
  13. protected $agent = [
  14. "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
  15. "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
  16. "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
  17. "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
  18. "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
  19. "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
  20. "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
  21. "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
  22. "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
  23. "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
  24. "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
  25. "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
  26. "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
  27. "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
  28. "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
  29. "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
  30. ];
  31. public $host = '';
  32. public $header = '';
  33. public $referer = '';
  34. public $antiLeech = '';
  35. public function __construct($host = '', $referer = '', $proxy = false)
  36. {
  37. /** @var 初始化curl信息 */
  38. $this->header = $this->agent[rand(0, count($this->agent) - 1)];
  39. $this->referer = empty($referer) ? 'http://weixin.sogou.com/' : $referer;
  40. $this->host = empty($host) ? 'weixin.sogou.com' : $host;
  41. /** @var 处理微信图片的防盗链 */
  42. $this->antiLeech = url('index/wx/url', [], true, true) . '?url=';
  43. }
  44. /**
  45. * 爬取内容
  46. * @author bignerd
  47. * @since 2016-08-16T10:13:58+0800
  48. * @param $url
  49. */
  50. public function _get($url)
  51. {
  52. // $ch=curl_init($url);
  53. // $options = [
  54. // CURLOPT_USERAGENT => $this->agent,
  55. // CURLOPT_REFERER => $this->referer,
  56. // ];
  57. // curl_setopt($ch,CURLOPT_RETURNTRANSFER,true);
  58. // curl_setopt($ch,CURLOPT_BINARYTRANSFER,true);
  59. // curl_setopt($ch,CURLOPT_TIMEOUT,60);
  60. // $output=curl_exec($ch);
  61. // return $output;
  62. $html = file_get_contents($url);
  63. return $html;
  64. }
  65. public function crawByUrl($url)
  66. {
  67. $content = $this->_get($url);
  68. $basicInfo = $this->articleBasicInfo($content);
  69. list($content_html, $content_text, $content_css) = $this->contentHandle($content);
  70. return array_merge($basicInfo, ['content_html' => $content_html, 'content_text' => $content_text, 'content_css' => $content_css]);
  71. }
  72. /**
  73. * 处理微信文章源码,提取文章主体,处理图片链接
  74. * @author bignerd
  75. * @since 2016-08-16T15:59:27+0800
  76. * @param $content 抓取的微信文章源码
  77. * @return [带图html文本,无图html文本]
  78. */
  79. public function contentHandle($content)
  80. {
  81. $content_html_pattern = '/<div class="rich_media_content " id="js_content"[^>]*>(.*?)<\/div>/s';
  82. preg_match_all($content_html_pattern, $content, $html_matchs);
  83. $content_html = $html_matchs[0][0];
  84. // 将隐藏的section显示出来
  85. $content_html = str_replace('visibility: hidden;', 'visibility: visible;', $content_html);
  86. $accessKeyId = config('app.ali_oss_access_key_id');
  87. $accessKeySecret = config('app.ali_oss_access_key_secret');
  88. $endpoint = config('app.ali_oss_end_point');
  89. $bucket = config('app.ali_oss_bucket');
  90. $bucketUrl = config('app.ali_oss_bindurl');
  91. $oss = new OssClient($accessKeyId, $accessKeySecret, $endpoint);
  92. /** 将视频中的地址进行替换 */
  93. $content_html = preg_replace_callback('/<iframe ([^>]*) data-src="(.*?)"/', function ($matches) use ($oss, $bucket, $bucketUrl) {
  94. // 将视频中内容上传到oss中
  95. $videourl = $matches[2];
  96. $query = parse_url(htmlspecialchars_decode($videourl), PHP_URL_QUERY);
  97. parse_str($query, $urlParam);
  98. if (isset($urlParam['vid'])) {
  99. $data = file_get_contents('https://mp.weixin.qq.com/mp/videoplayer?action=get_mp_video_play_url&vid=' . $urlParam['vid']);
  100. $data = json_decode($data, true);
  101. $videourl = $data['url_info'][0]['url'];
  102. }
  103. if (isset($urlParam['wx_fmt'])) {
  104. $ext = $urlParam['wx_fmt'];
  105. } else {
  106. $ext = parse_url($videourl, PHP_URL_PATH);
  107. $ext = pathinfo($ext, PATHINFO_EXTENSION);
  108. }
  109. if (empty($ext)) {
  110. trace('视频抓取失败:' . $videourl, 'error');
  111. return '<iframe ';
  112. }
  113. $content = file_get_contents($videourl);
  114. $path = 'wxcontent' . DIRECTORY_SEPARATOR . date('Ymd') . DIRECTORY_SEPARATOR . time() . rand(1000, 9999) . '.' . $ext;
  115. $oss->putObject($bucket, $path, $content);
  116. return '<iframe '.$matches[1].'src="' . 'https://' . $bucketUrl . '/' . $path . '"';
  117. }, $content_html);
  118. /** @var 带图片html文本 */
  119. $content_html = preg_replace_callback('/data-src="(.*?)"/', function ($matches) use ($oss, $bucket, $bucketUrl) {
  120. // 将图片中内容上传到oss中
  121. $ext = pathinfo($matches[1], PATHINFO_EXTENSION);
  122. if (empty($ext)) {
  123. $query = parse_url($matches[1], PHP_URL_QUERY);
  124. parse_str($query, $urlParam);
  125. if (isset($urlParam['wx_fmt'])) {
  126. $ext = $urlParam['wx_fmt'];
  127. }
  128. }
  129. $content = file_get_contents($matches[1]);
  130. $path = 'wxcontent' . DIRECTORY_SEPARATOR . date('Ymd') . DIRECTORY_SEPARATOR . time() . rand(1000, 9999) . '.' . $ext;
  131. $oss->putObject($bucket, $path, $content);
  132. return 'src="' . 'https://' . $bucketUrl . '/' . $path . '"';
  133. }, $content_html);
  134. /** @var 无图html文本 */
  135. $content_text = preg_replace('/<img.*?>/s', '', $content_html);
  136. /** @var css样式 */
  137. $content_css_pattern = '/<style[^>]*>(.*?)<\/style>/s';
  138. preg_match_all($content_css_pattern, $content, $css_matchs);
  139. return [$content_html, $content_text, $css_matchs[0][0]];
  140. }
  141. /**
  142. * 获取文章的基本信息
  143. * @author bignerd
  144. * @since 2016-08-16T17:16:32+0800
  145. * @param $content 文章详情源码
  146. * @return array $basicInfo
  147. */
  148. public function articleBasicInfo($content)
  149. {
  150. //待获取item
  151. $item = [
  152. 'ct' => 'date', //发布时间
  153. 'msg_title' => 'title', //标题
  154. 'msg_desc' => 'digest', //描述
  155. 'msg_link' => 'content_url', //文章链接
  156. 'cdn_url_1_1' => 'cover', //封面图片链接
  157. 'nickname' => 'wechatname', //公众号名称
  158. ];
  159. $basicInfo = [
  160. // 'author' => '',
  161. // 'copyright_stat' => '',
  162. ];
  163. foreach ($item as $k => $v) {
  164. $pattern = '/ var\s*' . $k . '\s*=\s*[\'|"](.*?)[\'|"][\.|;]/s';
  165. preg_match_all($pattern, $content, $matches);
  166. if (array_key_exists(1, $matches) && !empty($matches[1][0])) {
  167. $basicInfo[$v] = $this->htmlTransform($matches[1][0]);
  168. } else {
  169. $basicInfo[$v] = '';
  170. }
  171. }
  172. /** 获取作者 */
  173. preg_match('/<em class="rich_media_meta rich_media_meta_text">(.*?)<\/em>/s', $content, $matchAuthor);
  174. if (!empty($matchAuthor[1])) $basicInfo['author'] = $matchAuthor[1];
  175. /** 文章类型 */
  176. preg_match('/<span id="copyright_logo" class="rich_media_meta meta_original_tag">(.*?)<\/span>/s', $content, $matchType);
  177. if (!empty($matchType[1])) $basicInfo['copyright_stat'] = $matchType[1];
  178. return $basicInfo;
  179. }
  180. /**
  181. * 特殊字符转换
  182. * @author bignerd
  183. * @since 2016-08-16T17:30:52+0800
  184. * @param $string
  185. * @return $string
  186. */
  187. public function htmlTransform($string)
  188. {
  189. $string = str_replace('&quot;', '"', $string);
  190. $string = str_replace('&amp;', '&', $string);
  191. $string = str_replace('amp;', '', $string);
  192. $string = str_replace('&lt;', '<', $string);
  193. $string = str_replace('&gt;', '>', $string);
  194. $string = str_replace('&nbsp;', ' ', $string);
  195. $string = str_replace("\\", '', $string);
  196. return $string;
  197. }
  198. }