QueryList/src/Dom/Query.php

317 lines
8.7 KiB
PHP
Raw Normal View History

2017-09-22 01:51:46 +08:00
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/21
*/
namespace QL\Dom;
2018-12-10 19:23:15 +08:00
use Tightenco\Collect\Support\Collection;
2017-09-22 01:51:46 +08:00
use phpQuery;
2020-03-13 13:49:36 +08:00
use phpQueryObject;
2017-09-22 01:51:46 +08:00
use QL\QueryList;
2017-09-24 15:11:44 +08:00
use Closure;
2017-09-22 01:51:46 +08:00
/**
 * DOM query layer of QueryList.
 *
 * Holds the HTML passed to setHtml() and the phpQuery document built from it,
 * the extraction rules, the optional range selector, and the Collection
 * produced by the last query(). Configuration methods return the owning
 * QueryList instance so calls can be chained on it.
 */
class Query
{
    /**
     * HTML as resolved by value() in setHtml().
     */
    protected $html;

    /**
     * Document built by setHtml(); null until then and after destroyDocument().
     *
     * @var \phpQueryObject|null
     */
    protected $document;

    /**
     * Extraction rules keyed by rule name (see rules()).
     * Defaults to an empty array so query() can iterate it before rules() is called.
     *
     * @var array
     */
    protected $rules = [];

    /**
     * Selector for the repeated "row" elements, or null/empty for no slicing.
     */
    protected $range = null;

    /**
     * @var QueryList
     */
    protected $ql;

    /**
     * @var Collection
     */
    protected $data;

    public function __construct(QueryList $ql)
    {
        $this->ql = $ql;
    }

    /**
     * Get the current HTML.
     *
     * @param bool $rel true: serialize the loaded document (htmlOuter());
     *                  false: return the HTML stored by setHtml()
     * @return string
     */
    public function getHtml($rel = true)
    {
        return $rel ? $this->document->htmlOuter() : $this->html;
    }

    /**
     * Store the HTML and (re)build the phpQuery document from it.
     *
     * @param mixed $html passed through value() before being stored
     * @param null $charset forwarded to phpQuery::newDocumentHTML()
     * @return QueryList
     */
    public function setHtml($html, $charset = null)
    {
        $this->html = value($html);
        // Unload the previous document before replacing it.
        $this->destroyDocument();
        $this->document = phpQuery::newDocumentHTML($this->html, $charset);

        return $this->ql;
    }

    /**
     * Get crawl results
     *
     * Returns an empty Collection when no data has been produced yet
     * (neither query() nor setData() was called).
     *
     * @param Closure|null $callback optional post-processor, applied by handleData()
     * @return Collection|static
     */
    public function getData(Closure $callback = null)
    {
        $data = $this->data instanceof Collection ? $this->data : new Collection();

        return $this->handleData($data, $callback);
    }

    /**
     * @param Collection $data
     */
    public function setData(Collection $data)
    {
        $this->data = $data;
    }

    /**
     * Searches for all elements that match the specified expression.
     *
     * @param $selector A string containing a selector expression to match elements against.
     * @return Elements
     */
    public function find($selector)
    {
        return (new Dom($this->document))->find($selector);
    }

    /**
     * Set crawl rule
     *
     * $rules = [
     *    'rule_name1' => ['selector','HTML attribute | text | html','Tag filter list','callback'],
     *    'rule_name2' => ['selector','HTML attribute | text | html','Tag filter list','callback'],
     *    // ...
     * ]
     *
     * @param array $rules
     * @return QueryList
     */
    public function rules(array $rules)
    {
        $this->rules = $rules;

        return $this->ql;
    }

    /**
     * Set the slice area for crawl list
     *
     * @param $selector
     * @return QueryList
     */
    public function range($selector)
    {
        $this->range = $selector;

        return $this->ql;
    }

    /**
     * Remove HTML head, try to solve the garbled
     *
     * Replaces the <head> element of the stored HTML with an empty one and
     * reloads the document.
     *
     * @return QueryList
     */
    public function removeHead()
    {
        // "<head" must be followed by ">" or whitespace so that "<header>" is not
        // matched and an attribute-less "<head>" is; the lazy body stops at the
        // first closing tag.
        $html = preg_replace(
            '/<head(?:\s[^>]*)?>.*?<\/head\s*>/is',
            '<head></head>',
            $this->html
        );

        // preg_replace() returns null on a PCRE error; keep the HTML untouched then.
        if ($html === null) {
            $html = $this->html;
        }

        $this->setHtml($html);

        return $this->ql;
    }

    /**
     * Execute the query rule
     *
     * @param Closure|null $callback optional post-processor, applied by handleData()
     * @return QueryList
     */
    public function query(Closure $callback = null)
    {
        $this->data = $this->getList();
        $this->data = $this->handleData($this->data, $callback);

        return $this->ql;
    }

    /**
     * Apply a callback to extracted data.
     *
     * Without a range the callback is invoked once with the whole result
     * array (and null as second argument) and its return value is wrapped in
     * a new Collection; with a range it is mapped over the collection items.
     * A non-callable $callback leaves the data untouched.
     *
     * @param Collection $data
     * @param callable|null $callback
     * @return Collection
     */
    public function handleData(Collection $data, $callback)
    {
        if (!is_callable($callback)) {
            return $data;
        }

        if (empty($this->range)) {
            return new Collection($callback($data->all(), null));
        }

        return $data->map($callback);
    }

    /**
     * Run every rule against the document.
     *
     * Without a range the result is keyed by rule name; with a range it is a
     * list with one rule-name-keyed entry per element matched by the range.
     *
     * @return Collection
     */
    protected function getList()
    {
        $data = [];

        if (empty($this->range)) {
            foreach ($this->rules as $key => $reg_value) {
                $rule = $this->parseRule($reg_value);
                $contentElements = $this->document->find($rule['selector']);
                $data[$key] = $this->extractContent($contentElements, $key, $rule);
            }

            return new Collection($data);
        }

        $i = 0;
        foreach ($this->document->find($this->range) as $element) {
            foreach ($this->rules as $key => $reg_value) {
                $rule = $this->parseRule($reg_value);
                $contentElements = pq($element)->find($rule['selector']);
                $data[$i][$key] = $this->extractContent($contentElements, $key, $rule);
            }
            $i++;
        }

        return new Collection($data);
    }

    /**
     * Extract the value a rule asks for from the matched elements.
     *
     * 'text' / 'texts' go through allowTags(), 'html' / 'htmls' and
     * 'htmlOuter' / 'htmlOuters' go through stripTags(); the plural forms
     * produce an array with one entry per element. Any other value is read
     * as an attribute name. The rule's handle_callback, when callable,
     * receives the content and the rule name and its result is returned.
     *
     * @param phpQueryObject $pqObj
     * @param string|int $ruleName key of the rule in the rules array
     * @param array $rule rule as returned by parseRule()
     * @return mixed
     */
    protected function extractContent(phpQueryObject $pqObj, $ruleName, $rule)
    {
        $filter = $rule['filter_tags'];

        switch ($rule['attr']) {
            case 'text':
                $content = $this->allowTags($pqObj->html(), $filter);
                break;
            case 'texts':
                $content = (new Elements($pqObj))->map(function (Elements $element) use ($filter) {
                    return $this->allowTags($element->html(), $filter);
                })->all();
                break;
            case 'html':
                $content = $this->stripTags($pqObj->html(), $filter);
                break;
            case 'htmls':
                $content = (new Elements($pqObj))->map(function (Elements $element) use ($filter) {
                    return $this->stripTags($element->html(), $filter);
                })->all();
                break;
            case 'htmlOuter':
                $content = $this->stripTags($pqObj->htmlOuter(), $filter);
                break;
            case 'htmlOuters':
                $content = (new Elements($pqObj))->map(function (Elements $element) use ($filter) {
                    return $this->stripTags($element->htmlOuter(), $filter);
                })->all();
                break;
            default:
                $content = $pqObj->attr($rule['attr']);
                break;
        }

        if (is_callable($rule['handle_callback'])) {
            $content = call_user_func($rule['handle_callback'], $content, $ruleName);
        }

        return $content;
    }

    /**
     * Turn a positional rule into a named one.
     *
     * @param array $rule [selector, attr, filter tags (optional), callback (optional)]
     * @return array with keys selector, attr, filter_tags, handle_callback
     */
    protected function parseRule($rule)
    {
        return [
            'selector' => $rule[0],
            'attr' => $rule[1],
            'filter_tags' => $rule[2] ?? '',
            'handle_callback' => $rule[3] ?? null,
        ];
    }

    /**
     * Remove specific HTML tags.
     *
     * Names prefixed with "-" are removed together with their content (see
     * removeTags()); for the remaining names only the opening and closing
     * tags themselves are deleted.
     *
     * @param string $html
     * @param string $tags_str tag names separated by whitespace
     * @return string
     */
    protected function stripTags($html, $tags_str)
    {
        $tagsArr = $this->tag($tags_str);
        $html = $this->removeTags($html, $tagsArr[1]);

        $patterns = [];
        foreach ($tagsArr[0] as $tag) {
            // The lookahead ends the tag name, so "a" does not also hit <abbr> or <article>.
            $patterns[] = '/<\/?' . preg_quote($tag, '/') . '(?![\w:-])[^>]*>/i';
        }

        return preg_replace($patterns, '', trim($html));
    }

    /**
     * Keep specific HTML tags and strip all others.
     *
     * Names prefixed with "-" are removed together with their content first
     * (see removeTags()); the remaining names form the strip_tags() allow-list.
     *
     * @param string $html
     * @param string $tags_str tag names separated by whitespace
     * @return string
     */
    protected function allowTags($html, $tags_str)
    {
        $tagsArr = $this->tag($tags_str);
        $html = $this->removeTags($html, $tagsArr[1]);

        $allow = '';
        foreach ($tagsArr[0] as $tag) {
            $allow .= "<$tag> ";
        }

        return strip_tags(trim($html), $allow);
    }

    /**
     * Split a whitespace-separated tag list into two groups.
     *
     * @param string $tags_str
     * @return array [0 => names without a leading "-", 1 => names that had a leading "-" (prefix dropped)]
     */
    protected function tag($tags_str)
    {
        $tags = [[], []];

        foreach (preg_split("/\s+/", $tags_str, -1, PREG_SPLIT_NO_EMPTY) as $tag) {
            // Only a leading "-" marks removal; hyphenated names such as "my-widget" stay intact.
            if (preg_match('/^-(.+)/', $tag, $arr)) {
                $tags[1][] = $arr[1];
            } else {
                $tags[0][] = $tag;
            }
        }

        return $tags;
    }

    /**
     * Remove the elements matching the given tag names from the HTML.
     *
     * The HTML is loaded into a temporary phpQuery document, every element
     * matched by the comma-joined names is removed, and the document is
     * serialized again. With an empty list the HTML is returned unchanged.
     *
     * @param string $html
     * @param array $tags tag names
     * @return string
     */
    protected function removeTags($html, $tags)
    {
        if (!count($tags)) {
            return $html;
        }

        $doc = phpQuery::newDocumentHTML($html);
        try {
            pq($doc)->find(implode(',', $tags))->remove();

            return pq($doc)->htmlOuter();
        } finally {
            // Always release the temporary document, even if the query throws.
            $doc->unloadDocument();
        }
    }

    /**
     * Unload the current phpQuery document, if any, and drop the handle.
     */
    protected function destroyDocument()
    {
        if ($this->document instanceof phpQueryObject) {
            $this->document->unloadDocument();
        }
        $this->document = null;
    }

    public function __destruct()
    {
        $this->destroyDocument();
    }
}