QueryList/src/QueryList.php

216 lines
5.9 KiB
PHP
Raw Normal View History

2017-09-19 02:33:38 +08:00
<?php
namespace QL;
use phpQuery;
2017-09-19 17:48:48 +08:00
use QL\Dom\Dom;
2017-09-19 02:33:38 +08:00
/**
 * QueryList
 *
 * A general-purpose list scraping class built on top of phpQuery.
 *
 * @author Jaeger
 * @email JaegerCode@gmail.com
 * @link https://github.com/jae-jae/QueryList
 * @version 4.0.0
 *
 */
class QueryList
{
    /** @var mixed Raw HTML most recently passed to setHtml(). */
    protected $html;

    /** @var mixed phpQuery document created from $html by setHtml(). */
    protected $document;

    /**
     * @var array|null Extraction rules keyed by output field name. Each rule is
     *                 [selector, 'text'|'html'|attribute name, optional tag filter, optional callback].
     */
    protected $rules;

    /** @var mixed Optional selector of the repeating "row" element; empty means no range. */
    protected $range = null;

    /** @var mixed Not referenced by any method of this class. */
    protected $encoder;

    /**
     * QueryList constructor.
     */
    public function __construct()
    {
    }

    /**
     * Returns the HTML last stored by setHtml().
     *
     * @return mixed
     */
    public function getHtml()
    {
        return $this->html;
    }

    /**
     * Stores the HTML and (re)builds the phpQuery document from it.
     *
     * @param $html
     * @return $this
     */
    public function setHtml($html)
    {
        $this->html = $html;
        $this->document = phpQuery::newDocumentHTML($this->html);
        return $this;
    }

    /**
     * Wraps the current document in a Dom object and runs a selector on it.
     *
     * @param $selector
     * @return mixed Whatever Dom::find() returns.
     */
    public function find($selector)
    {
        return (new Dom($this->document))->find($selector);
    }

    /**
     * Sets the extraction rules used by query().
     *
     * @param array $rules field name => [selector, 'text'|'html'|attribute, tag filter?, callback?]
     * @return $this
     */
    public function rules(array $rules)
    {
        $this->rules = $rules;
        return $this;
    }

    /**
     * Sets the selector of the repeating element that each result row is taken from.
     *
     * @param $range
     * @return $this
     */
    public function range($range)
    {
        $this->range = $range;
        return $this;
    }

    /**
     * Replaces the <head> element of the stored HTML with an empty one and
     * re-parses the document.
     *
     * @return $this
     */
    public function removeHead()
    {
        // `\b` keeps <header ...> from being mistaken for <head>, and the lazy
        // `.*?` stops at the first </head> instead of the last one in the page.
        $html = preg_replace('/<head\b[^>]*>.*?<\/head>/is', '<head></head>', (string) $this->html);

        // preg_replace() returns null on a PCRE failure; keep the current
        // document in that case rather than replacing it with nothing.
        if ($html !== null) {
            $this->setHtml($html);
        }

        return $this;
    }

    /**
     * Runs the configured rules and returns the collected rows.
     *
     * @param callable|null $callback Optional mapper passed to the collection's map().
     * @return mixed The value of collect($data), mapped through $callback when given.
     */
    public function query($callback = null)
    {
        $data = $this->getList();
        return is_null($callback) ? $data : $data->map($callback);
    }

    /**
     * Applies every rule to the document and builds the result rows.
     *
     * With a range set, each element matched by the range becomes one row and
     * the rules are evaluated relative to it. Without a range, each rule is
     * evaluated against the whole document and its n-th match fills row n.
     *
     * @return mixed Result of collect() over the row array.
     */
    protected function getList()
    {
        $data = [];
        $document = $this->document;
        // rules() may never have been called; treat that as "no rules".
        $rules = (array) $this->rules;

        if (!empty($this->range)) {
            $row = 0;
            foreach (pq($document)->find($this->range) as $rangeItem) {
                foreach ($rules as $key => $rule) {
                    $target = pq($rangeItem)->find($rule[0]);
                    $data[$row][$key] = $this->extractValue($target, $rule, $key);
                }
                $row++;
            }
        } else {
            foreach ($rules as $key => $rule) {
                $row = 0;
                foreach (pq($document)->find($rule[0]) as $match) {
                    $data[$row][$key] = $this->extractValue($match, $rule, $key);
                    $row++;
                }
            }
        }

        // Resets phpQuery's static document registry.
        // NOTE(review): this presumably also drops $this->document from the
        // registry, so setHtml() may be needed before querying again - confirm.
        phpQuery::$documents = array();

        return collect($data);
    }

    /**
     * Extracts one field value from a node according to a single rule.
     *
     * @param mixed $node Node or phpQuery object the value is read from.
     * @param array $rule [selector, 'text'|'html'|attribute, tag filter?, callback?]
     * @param mixed $key  Field name, forwarded to the rule's callback.
     * @return mixed
     */
    private function extractValue($node, $rule, $key)
    {
        $tags = $rule[2] ?? '';

        switch ($rule[1]) {
            case 'text':
                // Text mode: drop all markup except the tags listed in the filter.
                $value = $this->allowTags(pq($node)->html(), $tags);
                break;
            case 'html':
                // HTML mode: keep markup, dropping only the tags listed in the filter.
                $value = $this->stripTags(pq($node)->html(), $tags);
                break;
            default:
                // Anything else is read as an attribute name.
                $value = pq($node)->attr($rule[1]);
                break;
        }

        if (isset($rule[3])) {
            $value = call_user_func($rule[3], $value, $key);
        }

        return $value;
    }

    /**
     * Removes specific HTML tags.
     *
     * Plain names have only their opening/closing tags stripped (content is
     * kept); names prefixed with "-" are removed together with their content.
     *
     * @param string $html
     * @param string $tags_str Tag names separated by whitespace
     * @return string
     */
    protected function stripTags($html, $tags_str)
    {
        $tagsArr = $this->tag($tags_str);
        $html = $this->removeTags($html, $tagsArr[1]);

        $patterns = array();
        foreach ($tagsArr[0] as $tag) {
            // The lookahead requires the tag name to end here, so stripping
            // "b" does not also strip <br>, <body> or <button>.
            $patterns[] = '/<\/?' . preg_quote($tag, '/') . '(?=[\s\/>])[^>]*>/i';
        }

        return preg_replace($patterns, '', trim($html));
    }

    /**
     * Keeps only specific HTML tags.
     *
     * Plain names are preserved by strip_tags(); names prefixed with "-" are
     * removed together with their content first.
     *
     * @param string $html
     * @param string $tags_str Tag names separated by whitespace
     * @return string
     */
    protected function allowTags($html, $tags_str)
    {
        $tagsArr = $this->tag($tags_str);
        $html = $this->removeTags($html, $tagsArr[1]);

        $allow = '';
        foreach ($tagsArr[0] as $tag) {
            $allow .= "<$tag> ";
        }

        return strip_tags(trim($html), $allow);
    }

    /**
     * Splits a whitespace-separated tag filter into two lists.
     *
     * @param string $tags_str e.g. "a -script p"
     * @return array [0] => plain tag names, [1] => names that were prefixed with "-"
     */
    protected function tag($tags_str)
    {
        $tagArr = preg_split("/\s+/", $tags_str, -1, PREG_SPLIT_NO_EMPTY);
        $tags = array(array(), array());

        foreach ($tagArr as $tag) {
            // Anchored so that only a leading "-" marks removal; a hyphen
            // inside a name (e.g. "my-widget") is part of the tag name.
            if (preg_match('/^-(.+)$/', $tag, $arr)) {
                $tags[1][] = $arr[1];
            } else {
                $tags[0][] = $tag;
            }
        }

        return $tags;
    }

    /**
     * Removes specific HTML elements, including their content.
     *
     * @param string $html
     * @param array $tags Array of tag names
     * @return string The input unchanged when $tags is empty, otherwise the
     *                outer HTML of the re-parsed fragment after removal.
     */
    protected function removeTags($html, $tags)
    {
        if (count($tags)) {
            // Comma-joined names form a selector matching any of the tags.
            $selector = implode(',', $tags);

            $doc = phpQuery::newDocumentHTML($html);
            pq($doc)->find($selector)->remove();
            $html = pq($doc)->htmlOuter();
            // Release the temporary document created for this fragment.
            $doc->unloadDocument();
        }

        return $html;
    }
}