2017-09-22 01:51:46 +08:00
|
|
|
<?php
|
|
|
|
/**
|
|
|
|
* Created by PhpStorm.
|
|
|
|
* User: Jaeger <JaegerCode@gmail.com>
|
|
|
|
* Date: 2017/9/21
|
|
|
|
*/
|
|
|
|
|
|
|
|
namespace QL\Dom;
|
|
|
|
|
2018-12-10 19:23:15 +08:00
|
|
|
use Tightenco\Collect\Support\Collection;
|
2017-09-22 01:51:46 +08:00
|
|
|
use phpQuery;
|
2020-03-13 13:49:36 +08:00
|
|
|
use phpQueryObject;
|
2017-09-22 01:51:46 +08:00
|
|
|
use QL\QueryList;
|
2017-09-24 15:11:44 +08:00
|
|
|
use Closure;
|
2017-09-22 01:51:46 +08:00
|
|
|
|
|
|
|
/**
 * DOM query helper used by QueryList.
 *
 * Holds the HTML being inspected, the phpQuery document built from it, the
 * extraction rules and the optional range selector, and turns them into a
 * Collection of extracted rows. Fluent setters return the owning QueryList
 * instance that was passed to the constructor.
 */
class Query
{
    /**
     * HTML most recently stored by setHtml() (the result of value($html)).
     *
     * @var mixed
     */
    protected $html;

    /**
     * phpQuery document built from $html by setHtml().
     *
     * @var \phpQueryObject
     */
    protected $document;

    /**
     * Extraction rules as passed to rules(); unset until rules() is called.
     *
     * @var array|null
     */
    protected $rules;

    /**
     * Selector of the list-item area set by range(), or null when no range is used.
     *
     * @var mixed
     */
    protected $range = null;

    /**
     * Owning QueryList instance; returned by the fluent methods.
     *
     * @var QueryList
     */
    protected $ql;

    /**
     * Result of the last query() call, or whatever was injected through setData().
     *
     * @var Collection
     */
    protected $data;

    /**
     * @param QueryList $ql Owning QueryList instance returned by the fluent methods
     */
    public function __construct(QueryList $ql)
    {
        $this->ql = $ql;
    }

    /**
     * Get the HTML stored by the last setHtml() call.
     *
     * @return mixed
     */
    public function getHtml()
    {
        return $this->html;
    }

    /**
     * Store new HTML and rebuild the phpQuery document from it.
     *
     * The previously loaded document (if any) is unloaded before the new one
     * is created.
     *
     * @param mixed $html Passed through the value() helper before being stored
     *                    (presumably so a Closure can supply the HTML lazily - confirm against value())
     * @param mixed $charset Forwarded unchanged to phpQuery::newDocumentHTML()
     * @return QueryList
     */
    public function setHtml($html, $charset = null)
    {
        $this->html = value($html);
        $this->destroyDocument();
        $this->document = phpQuery::newDocumentHTML($this->html, $charset);

        return $this->ql;
    }

    /**
     * Get crawl results
     *
     * Returns the stored data as is, or the result of mapping it through
     * $callback when one is given. The stored data is not modified.
     *
     * @param Closure|null $callback
     * @return Collection|static
     */
    public function getData(Closure $callback = null)
    {
        return is_null($callback) ? $this->data : $this->data->map($callback);
    }

    /**
     * Replace the stored crawl results.
     *
     * @param Collection $data
     */
    public function setData(Collection $data)
    {
        $this->data = $data;
    }

    /**
     * Searches for all elements that match the specified expression.
     *
     * @param $selector A string containing a selector expression to match elements against.
     * @return Elements
     */
    public function find($selector)
    {
        return (new Dom($this->document))->find($selector);
    }

    /**
     * Set crawl rule
     *
     * $rules = [
     *    'rule_name1' => ['selector','HTML attribute | text | html','Tag filter list','callback'],
     *    'rule_name2' => ['selector','HTML attribute | text | html','Tag filter list','callback'],
     *    // ...
     * ]
     *
     * @param array $rules
     * @return QueryList
     */
    public function rules(array $rules)
    {
        $this->rules = $rules;

        return $this->ql;
    }

    /**
     * Set the slice area for crawl list
     *
     * @param $selector
     * @return QueryList
     */
    public function range($selector)
    {
        $this->range = $selector;

        return $this->ql;
    }

    /**
     * Remove the content of the HTML head, to try to work around garbled text.
     *
     * The first <head ...>...</head> section is replaced by an empty
     * <head></head> and the document is rebuilt through setHtml() without an
     * explicit charset.
     *
     * @return QueryList
     */
    public function removeHead()
    {
        // "\b" keeps <header> from being mistaken for <head>; "[^>]*" and the lazy ".*?"
        // also match an attribute-less or nearly empty head and stop at the first </head>.
        $html = preg_replace('/<head\b[^>]*>.*?<\/head>/is', '<head></head>', $this->html);

        // preg_replace() returns null when PCRE fails; fall back to the current
        // markup instead of replacing the document with nothing.
        $this->setHtml($html === null ? $this->html : $html);

        return $this->ql;
    }

    /**
     * Execute the query rule
     *
     * Stores the extracted list as the current data; when a callback is
     * given, the stored data is the list mapped through it.
     *
     * @param Closure|null $callback
     * @return QueryList
     */
    public function query(Closure $callback = null)
    {
        $this->data = $this->getList();
        if ($callback) {
            $this->data = $this->data->map($callback);
        }

        return $this->ql;
    }

    /**
     * Build the result rows from the current document, rules and range.
     *
     * With a range selector, every element matched by the range becomes one
     * row and each rule is evaluated inside that element. Without a range,
     * each rule is evaluated against the whole document and its n-th match
     * is written to row n.
     *
     * @return Collection
     */
    protected function getList()
    {
        $data = [];

        // Parsing is independent of the element being processed, so do it once.
        // The array cast makes a rule set that was never configured behave as empty.
        $parsedRules = [];
        foreach ((array) $this->rules as $key => $reg_value) {
            $parsedRules[$key] = $this->parseRule($reg_value);
        }

        if (!empty($this->range)) {
            $rangeElements = $this->document->find($this->range);
            $i = 0;
            foreach ($rangeElements as $element) {
                foreach ($parsedRules as $key => $rule) {
                    $contentElements = pq($element)->find($rule['selector']);
                    $data[$i][$key] = $this->extractContent($contentElements, $key, $rule);
                }
                $i++;
            }
        } else {
            foreach ($parsedRules as $key => $rule) {
                $contentElements = $this->document->find($rule['selector']);
                $i = 0;
                foreach ($contentElements as $element) {
                    $data[$i][$key] = $this->extractContent(pq($element), $key, $rule);
                    $i++;
                }
            }
        }

        return collect($data);
    }

    /**
     * Extract the value described by one parsed rule from the matched elements.
     *
     * Supported values of $rule['attr']:
     *  - 'text'       inner HTML passed through allowTags()
     *  - 'texts'      the same, once per element, returned as an array
     *  - 'html'       inner HTML passed through stripTags()
     *  - 'htmls'      the same, once per element, returned as an array
     *  - 'htmlOuter'  outer HTML passed through stripTags()
     *  - 'htmlOuters' the same, once per element, returned as an array
     *  - anything else is read as an attribute name via attr()
     *
     * When the rule carries a handle callback, it is called with the
     * extracted content and the rule name, and its return value is used.
     *
     * @param phpQueryObject $pqObj Matched elements
     * @param mixed $ruleName Key of the rule in the rules array
     * @param array $rule Parsed rule as produced by parseRule()
     * @return mixed
     */
    protected function extractContent(phpQueryObject $pqObj, $ruleName, $rule)
    {
        switch ($rule['attr']) {
            case 'text':
                $content = $this->allowTags($pqObj->html(), $rule['filter_tags']);
                break;
            case 'texts':
                $content = (new Elements($pqObj))->map(function (Elements $element) use ($rule) {
                    return $this->allowTags($element->html(), $rule['filter_tags']);
                })->all();
                break;
            case 'html':
                $content = $this->stripTags($pqObj->html(), $rule['filter_tags']);
                break;
            case 'htmls':
                $content = (new Elements($pqObj))->map(function (Elements $element) use ($rule) {
                    return $this->stripTags($element->html(), $rule['filter_tags']);
                })->all();
                break;
            case 'htmlOuter':
                $content = $this->stripTags($pqObj->htmlOuter(), $rule['filter_tags']);
                break;
            case 'htmlOuters':
                $content = (new Elements($pqObj))->map(function (Elements $element) use ($rule) {
                    return $this->stripTags($element->htmlOuter(), $rule['filter_tags']);
                })->all();
                break;
            default:
                $content = $pqObj->attr($rule['attr']);
                break;
        }

        if ($rule['handle_callback']) {
            $content = call_user_func($rule['handle_callback'], $content, $ruleName);
        }

        return $content;
    }

    /**
     * Convert a positional rule definition into a keyed array.
     *
     * @param array $rule [selector, attr, filter tags (optional), callback (optional)]
     * @return array Keys: selector, attr, filter_tags ('' when absent), handle_callback (null when absent)
     */
    protected function parseRule($rule)
    {
        $result = [];
        $result['selector'] = $rule[0];
        $result['attr'] = $rule[1];
        $result['filter_tags'] = $rule[2] ?? '';
        $result['handle_callback'] = $rule[3] ?? null;

        return $result;
    }

    /**
     * Strip specific HTML tags.
     *
     * Names prefixed with "-" are removed as whole elements through
     * removeTags(); for the remaining names only the opening and closing
     * tags are deleted, and whatever sits between them is kept.
     *
     * @param string $html
     * @param string $tags_str Tag names separated by whitespace
     * @return string
     */
    protected function stripTags($html, $tags_str)
    {
        $tagsArr = $this->tag($tags_str);
        $html = $this->removeTags($html, $tagsArr[1]);

        $p = [];
        foreach ($tagsArr[0] as $tag) {
            // The lookahead requires the tag name to end here, so "a" does not
            // also match <abbr> or <article>, and "p" does not match <pre>.
            $p[] = '/(<(?:\/' . $tag . '|' . $tag . ')(?=[\s\/>])[^>]*>)/i';
        }

        return preg_replace($p, '', trim($html));
    }

    /**
     * Keep only specific HTML tags.
     *
     * Names prefixed with "-" are removed as whole elements through
     * removeTags(); the remaining names form the allow-list handed to
     * strip_tags(), so every other tag is stripped.
     *
     * @param string $html
     * @param string $tags_str Tag names separated by whitespace
     * @return string
     */
    protected function allowTags($html, $tags_str)
    {
        $tagsArr = $this->tag($tags_str);
        $html = $this->removeTags($html, $tagsArr[1]);

        $allow = '';
        foreach ($tagsArr[0] as $tag) {
            $allow .= "<$tag> ";
        }

        return strip_tags(trim($html), $allow);
    }

    /**
     * Split a whitespace-separated tag list into plain and "-"-prefixed names.
     *
     * @param string $tags_str Tag names separated by whitespace
     * @return array Index 0: names without a "-" prefix; index 1: names that had a "-" prefix (prefix removed)
     */
    protected function tag($tags_str)
    {
        $tagArr = preg_split("/\s+/", $tags_str, -1, PREG_SPLIT_NO_EMPTY);
        $tags = [[], []];

        foreach ($tagArr as $tag) {
            // Anchored at the start so a hyphen inside a name (e.g. "my-widget")
            // is not mistaken for the removal prefix.
            if (preg_match('/^-(.+)/', $tag, $arr)) {
                $tags[1][] = $arr[1];
            } else {
                $tags[0][] = $tag;
            }
        }

        return $tags;
    }

    /**
     * Remove specific HTML elements.
     *
     * The HTML is loaded into a temporary phpQuery document, every element
     * matching one of the tag names is removed, and the document is
     * serialized back. When $tags is empty the HTML is returned untouched.
     *
     * @param string $html
     * @param array $tags Array of tag names
     * @return string
     */
    protected function removeTags($html, $tags)
    {
        if (count($tags)) {
            $doc = phpQuery::newDocumentHTML($html);
            try {
                pq($doc)->find(implode(',', $tags))->remove();
                $html = pq($doc)->htmlOuter();
            } finally {
                // Always release the temporary document, even when phpQuery throws.
                $doc->unloadDocument();
            }
        }

        return $html;
    }

    /**
     * Unload the current phpQuery document, if one has been created.
     */
    protected function destroyDocument()
    {
        if ($this->document instanceof phpQueryObject) {
            $this->document->unloadDocument();
        }
    }

    /**
     * Release the phpQuery document when this object is destroyed.
     */
    public function __destruct()
    {
        $this->destroyDocument();
    }
}
|