Coder PJ皮皮鲁

php提取网页正文

2016-11-12

因为业务需要爬取网站的主要内容信息，因此在网上找了一段提取网页正文的代码。这段代码的主要逻辑是：找出网页中所有的<p></p>标签，再根据其父标签的class、id以及段落内容的长度进行加分或减分，最后得分最高的父标签就是要提取的正文。该算法的缺点是：如果正文内容没有使用<p></p>标签，就无法正确地提取出正文。
代码如下:

<?php
/**
 * Extracts the main content block of an HTML page.
 *
 * Every <p> element contributes to the score of its parent element: the
 * parent's class and id add or subtract points and the paragraph's text
 * length is added on top. The highest-scoring parent is returned as the page
 * content after junk tags and attributes have been stripped from a copy.
 *
 * Limitation: text that is not wrapped in <p> elements is never scored.
 */
class Readability {
    // Name of the attribute under which each candidate's score is stored on the source DOM
    const ATTR_CONTENT_SCORE = "contentScore";
    // Charset the DOM document is forced to after parsing
    const DOM_DEFAULT_CHARSET = "utf-8";
    // Message of the exception thrown when no content block can be determined
    const MESSAGE_CAN_NOT_GET = "Readability was unable to parse this page for content.";
    // Parsed document (DOMDocument)
    protected $DOM = null;
    // Original, unmodified source passed to the constructor
    protected $source = "";
    // Candidate parent elements found by the last getTopBox() run
    private $parentNodes = array();
    // Tags removed from the extracted content
    // Note: added extra tags from https://github.com/ridcully
    private $junkTags = Array("style", "form", "iframe", "script", "button", "input", "textarea", 
                                "noscript", "select", "option", "object", "applet", "basefont",
                                "bgsound", "blink", "canvas", "command", "menu", "nav", "datalist",
                                "embed", "frame", "frameset", "keygen", "label", "marquee", "link");
    // Attributes removed from every element of the extracted content
    private $junkAttrs = Array("style", "class", "onclick", "onmouseover", "align", "border", "margin");
    /**
     * Parses the given HTML source into a DOM document.
     *
     *      @param $source     HTML source of the page
     *      @param $input_char Encoding of $source. Defaults to utf-8 and may be omitted
     */
    public function __construct($source, $input_char = "utf-8") {
        $this->source = $source;
        // Turn non-ASCII characters into entities so the parser is independent of the input charset.
        // NOTE(review): the 'HTML-ENTITIES' mbstring encoding is deprecated as of PHP 8.2.
        $source = mb_convert_encoding($source, 'HTML-ENTITIES', $input_char);
        // Pre-process the markup, dropping redundant tags
        $source = $this->preparSource($source);
        $this->DOM = new DOMDocument('1.0', $input_char);
        try {
            // The parser reports many recoverable markup errors; they are suppressed on purpose.
            // The leading <?xml ...> processing instruction is a hack that sets the parser's encoding.
            if (!@$this->DOM->loadHTML('<?xml encoding="'.Readability::DOM_DEFAULT_CHARSET.'">'.$source)) {
                throw new Exception("Parse HTML Error!");
            }
            // childNodes is a live list: take a snapshot first so that removing a
            // node cannot disturb the iteration.
            foreach (iterator_to_array($this->DOM->childNodes) as $item) {
                if ($item->nodeType == XML_PI_NODE) {
                    $this->DOM->removeChild($item); // remove hack
                }
            }
            // insert proper
            $this->DOM->encoding = Readability::DOM_DEFAULT_CHARSET;
        } catch (Exception $e) {
            // Deliberate best effort: a page that cannot be parsed leaves an empty
            // document behind, and getContent() then reports the failure.
        }
    }
    /**
     * Runs preg_replace() but returns the subject unchanged when PCRE fails
     * (preg_replace() returns null on error, e.g. when the backtrack limit is
     * hit), so one failing pattern cannot wipe the whole document.
     *
     * @return String
     */
    private function replaceOrKeep($pattern, $replacement, $subject, $limit = -1) {
        $result = preg_replace($pattern, $replacement, $subject, $limit);
        return $result === null ? $subject : $result;
    }
    /**
     * Pre-processes the HTML source so that the DOM parser can handle it.
     *
     * @return String
     */
    private function preparSource($string) {
        // Drop the first charset declaration so it cannot conflict with the forced encoding
        $string = $this->replaceOrKeep("/charset=([\w|\-]+);?/", "", $string, 1);
        // Replace all doubled-up <BR> tags with <P> tags, and remove fonts.
        $string = $this->replaceOrKeep("/<br\/?>[ \r\n\s]*<br\/?>/i", "</p><p>", $string);
        $string = $this->replaceOrKeep("/<\/?font[^>]*>/i", "", $string);
        // @see https://github.com/feelinglucky/php-readability/issues/7
        //   - from http://stackoverflow.com/questions/7130867/remove-script-tag-from-html-content
        $string = $this->replaceOrKeep("#<script(.*?)>(.*?)</script>#is", "", $string);
        return trim($string);
    }
    /**
     * Removes every $TagName element below $RootNode.
     *
     * @return DOMDocument
     */
    private function removeJunkTag($RootNode, $TagName) {
        $Tags = $RootNode->getElementsByTagName($TagName);
        //Note: always index 0, because removing a tag removes it from the results as well.
        while ($Tag = $Tags->item(0)) {
            $Tag->parentNode->removeChild($Tag);
        }
        return $RootNode;
    }
    /**
     * Removes the attribute $Attr from every element below $RootNode.
     *
     * @return DOMDocument
     */
    private function removeJunkAttr($RootNode, $Attr) {
        $Tags = $RootNode->getElementsByTagName("*");
        $i = 0;
        while ($Tag = $Tags->item($i++)) {
            $Tag->removeAttribute($Attr);
        }
        return $RootNode;
    }
    /**
     * Scores the parent of every <p> element and returns the best one.
     *      Scoring algorithm from: http://code.google.com/p/arc90labs-readability/
     *
     * Scores are accumulated in a local map and only written to the DOM at the
     * end, so calling this method repeatedly always yields the same scores.
     *
     * @return DOMNode the highest-scoring parent, or null when none scores above zero
     */
    private function getTopBox() {
        $allParagraphs = $this->DOM->getElementsByTagName("p");
        // Start from a clean slate on every run
        $this->parentNodes = array();
        $scores = array();
        // Study all the paragraphs and find the chunk that has the best score.
        $i = 0;
        while ($paragraph = $allParagraphs->item($i++)) {
            $parentNode = $paragraph->parentNode;
            // Only elements carry class/id attributes and can be scored
            if (!($parentNode instanceof DOMElement)) {
                continue;
            }
            // Register each parent once, in first-seen order; the stored node keeps the hash stable
            $key = spl_object_hash($parentNode);
            if (!isset($scores[$key])) {
                $scores[$key] = 0;
                $this->parentNodes[$key] = $parentNode;
            }
            $contentScore = $scores[$key];
            $className    = $parentNode->getAttribute("class");
            $id           = $parentNode->getAttribute("id");
            // Look for a special classname
            if (preg_match("/(comment|meta|footer|footnote)/i", $className)) {
                $contentScore -= 50;
            } else if(preg_match(
                "/((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\\s|$))/i",
                $className)) {
                $contentScore += 25;
            }
            // Look for a special ID
            if (preg_match("/(comment|meta|footer|footnote)/i", $id)) {
                $contentScore -= 50;
            } else if (preg_match(
                "/^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/i",
                $id)) {
                $contentScore += 25;
            }
            // Paragraphs longer than 10 bytes add their byte length to the score
            $length = strlen($paragraph->nodeValue);
            if ($length > 10) {
                $contentScore += $length;
            }
            $scores[$key] = $contentScore;
        }
        $topBox   = null;
        $topScore = 0;
        foreach ($this->parentNodes as $key => $parentNode) {
            // Keep publishing the score on the source DOM under ATTR_CONTENT_SCORE
            $parentNode->setAttribute(Readability::ATTR_CONTENT_SCORE, (string) $scores[$key]);
            // Strictly greater: on a tie the first candidate wins, and a score must be positive
            if ($scores[$key] > $topScore) {
                $topBox   = $parentNode;
                $topScore = $scores[$key];
            }
        }
        // $topBox is now the element judged to hold the main content (or null)
        return $topBox;
    }
    /**
     * Returns the page title, cut at the " - " separator.
     *
     * @return String the part before the separator, the whole trimmed title
     *                when there is no separator, or null without a <title>
     */
    public function getTitle() {
        $split_point = ' - ';
        $titleNodes = $this->DOM->getElementsByTagName("title");
        if ($titleNodes->length 
            && $titleNode = $titleNodes->item(0)) {
            // @see http://stackoverflow.com/questions/717328/how-to-explode-string-right-to-left
            // The string is reversed, split, and every piece reversed back.
            $title  = trim($titleNode->nodeValue);
            $result = array_map('strrev', explode($split_point, strrev($title)));
            return sizeof($result) > 1 ? array_pop($result) : $title;
        }
        return null;
    }
    /**
     * Get Leading Image Url
     *
     * @return String the src attribute of the first <img> below $node, or null
     */
    public function getLeadImageUrl($node) {
        $images = $node->getElementsByTagName("img");
        if ($images->length && $leadImage = $images->item(0)) {
            return $leadImage->getAttribute("src");
        }
        return null;
    }
    /**
     * Returns the main content of the page.
     *
     * @return Array with the keys lead_image_url, word_count, title and content
     * @throws RuntimeException when no content block can be determined
     */
    public function getContent() {
        if (!$this->DOM) return false;
        $ContentTitle = $this->getTitle();
        $ContentBox = $this->getTopBox();
        //Check if we found a suitable top-box.
        if ($ContentBox === null) {
            throw new RuntimeException(Readability::MESSAGE_CAN_NOT_GET);
        }
        // Work on a copy in a fresh DOMDocument so the source DOM stays intact
        $Target = new DOMDocument;
        $Target->appendChild($Target->importNode($ContentBox, true));
        foreach ($this->junkTags as $tag) {
            $Target = $this->removeJunkTag($Target, $tag);
        }
        foreach ($this->junkAttrs as $attr) {
            $Target = $this->removeJunkAttr($Target, $attr);
        }
        // The score marker is internal bookkeeping and must not appear in the returned HTML
        $Target = $this->removeJunkAttr($Target, Readability::ATTR_CONTENT_SCORE);
        $content = mb_convert_encoding($Target->saveHTML(), Readability::DOM_DEFAULT_CHARSET, "HTML-ENTITIES");
        // Several values are returned together as an array
        return Array(
            'lead_image_url' => $this->getLeadImageUrl($Target),
            'word_count' => mb_strlen(strip_tags($content), Readability::DOM_DEFAULT_CHARSET),
            'title' => $ContentTitle ? $ContentTitle : null,
            'content' => $content
        );
    }
    function __destruct() { }
}
// Usage example: download a page and print its extracted main content.
$url = ""; // set this to the address of the page to extract
// An empty address cannot be fetched, and file_get_contents() returns false
// on failure; stop here instead of feeding a non-page into the parser.
$content = ($url === "") ? false : file_get_contents($url);
if ($content === false) {
    exit("Unable to download page: " . $url);
}
try {
    $article = new Readability($content);
    $data = $article->getContent();
    // getContent() returns false when no document is available
    if ($data !== false) {
        echo $data['content'];
    }
} catch (RuntimeException $e) {
    // getContent() throws when no main content block can be determined
    echo $e->getMessage();
}

静态文件服务

2016-11-08

最近写程序发现静态文件的路径不对,因此写了一段小的代码验证路径的起始位置

package main
import (
            "net/http"
       )
// main serves the current working directory over HTTP on port 8111, which
// makes it easy to check where static file paths are resolved from.
func main() {
	http.Handle("/", http.FileServer(http.Dir("./")))
	// ListenAndServe only returns when serving failed (for example when the
	// port is already in use); report that instead of exiting silently.
	if err := http.ListenAndServe(":8111", nil); err != nil {
		panic(err)
	}
}

PHP多进程并发同步

2016-11-07

在公司做的一个红包系统中出现了并发同步问题，原本应该有唯一性的数据出现了重复，究其原因是因为PHP在判断用户的发放记录时出现了脏读，因此导致数据库的数据出问题，因为PHP没有进程锁的机制，所以参考网上资料使用PHP的文件锁，对并发进程进行锁定来强制同步。

<?php
// Serialise concurrent PHP processes with an exclusive lock on a shared file.
$fp = fopen("lock.txt", "w+");
if ($fp === false) {
    // Without a handle there is nothing to lock; flock()/fclose() must not be
    // called on false.
    throw new RuntimeException("Unable to open lock file: lock.txt");
}
// flock() with LOCK_EX waits until the exclusive lock has been acquired.
if (flock($fp, LOCK_EX)) {
    // ... process the order while holding the lock
    flock($fp, LOCK_UN);
}

fclose($fp);

Go实现工作池

2016-11-01

package main
import "fmt"
import "time"
// worker receives jobs until the jobs channel is closed and drained. For
// every job it logs a line, sleeps for one second to simulate work and sends
// twice the job value to results.
func worker(id int, jobs <-chan int, results chan<- int) {
	for {
		job, open := <-jobs
		if !open {
			return
		}
		fmt.Println("worker", id, "processing job", job)
		time.Sleep(time.Second)
		results <- job * 2
	}
}
// main runs nine jobs through a pool of three worker goroutines and waits
// until every result has been received.
func main() {
	const (
		workerCount = 3
		jobCount    = 9
	)
	// Two buffered channels: one carries the work items, the other the results.
	jobs := make(chan int, 100)
	results := make(chan int, 100)
	// Start the pool. It holds exactly three goroutines here; a real program
	// could grow or shrink it as needed.
	for id := 1; id <= workerCount; id++ {
		go worker(id, jobs, results)
	}
	// Queue all jobs, then close the channel so the workers stop once it is drained.
	for job := 1; job <= jobCount; job++ {
		jobs <- job
	}
	close(jobs)
	// Receive one result per job; the values themselves are discarded.
	for received := 0; received < jobCount; received++ {
		<-results
	}
}

编程书籍网站

2016-10-30

偶尔会在网上找些资料和书籍,以前在微博上看到的SaltTiger会定期的发布些相关的书籍.
另外还可以在avxhome 和 Fox eBook找到一些编程相关的书籍
记录一下,做个备忘

php多进程

2016-10-30

由于php没有多进程，不能直接操控线程/进程。所以只能依赖于linux来实现多进程。
php函数pcntl_fork()可以创建进程，等同于linux的fork。
和fork一样，pcntl_fork在子进程中返回0，在父进程中返回子进程的pid；创建失败时在父进程中返回-1，且不会创建子进程。

使用go获取url地址302后的真正url

2016-10-23

因为某种需求,需要获取某篇文章在302跳转后真正的URL地址,既然需求有了就需要来解决,在看了网上各种的解决方法后觉得有些复杂,因此就有了以下的解决方法,如果这个方法不好,或有更好的解决方法,欢迎在评论处指出,谢谢.

golang爬虫

2016-10-22

因为收藏了某些网站中的内容,收藏的内容太多而且网站不提供检索功能,因此需要将收藏的内容获取下来,然后进行检索.所以就写了这个小爬虫进行数据获取并保存到mysql中.

package main
import (
	"bytes"
	"database/sql"
	"fmt"
	_ "github.com/go-sql-driver/mysql"
	"github.com/opesun/goquery"
	"io"
	"io/ioutil"
	"net/http"
	"os"
	"strconv"
	"strings"
)
// main walks the favorites pages one by one, starting at page 1, and stores
// every entry in MySQL. The loop has no exit of its own: httpGet terminates
// the program once it sees the "no favorites" page.
func main() {
	db, err := sql.Open("mysql", "user:pwd@tcp(localhost:3306)/dbname?charset=utf8")
	checkErr(err)
	// Register the close right after a successful Open. Placed after the
	// endless loop the statement was unreachable. It still only runs when main
	// unwinds (for example on a panic); os.Exit skips deferred calls.
	defer db.Close()
	cookie := ""
	for page := 1; ; page++ {
		content := httpGet(page, cookie)
		parseHtml(content, db)
		// The page has been parsed; release the reader.
		content.Close()
	}
}
// httpGet downloads favorites page i, sending cookie in the Cookie header,
// and returns the page body as an in-memory reader.
//
// The process exits with status 1 when the request cannot be built and with
// status 0 when the page reports that there are no favorites, which is how
// the crawl ends. A failed request or read panics via checkErr.
func httpGet(i int, cookie string) io.ReadCloser {
	url := "http://xxxxx.io/favorites?page=" + strconv.Itoa(i)
	fmt.Println(url)
	client := &http.Client{}
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		// The builtin print does not render an error's message; write it to
		// stderr with fmt instead.
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	req.Header.Set("Pragma", "no-cache")
	req.Header.Set("Accept-Language", "zh-CN,zh;q=0.8")
	req.Header.Set("Upgrade-Insecure-Requests", "1")
	req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36")
	req.Header.Set("Connection", "keep-alive")
	req.Header.Set("Cookie", cookie)
	resp, err := client.Do(req)
	checkErr(err)
	// Read the whole body, then close the network stream right away so the
	// connection is released; the caller only ever sees the buffered copy.
	body, err := ioutil.ReadAll(resp.Body)
	resp.Body.Close()
	checkErr(err)
	// The site answers with this text once the page number runs past the last page.
	if strings.Contains(string(body), "您还没有任何收藏") {
		os.Exit(0)
	}
	return ioutil.NopCloser(bytes.NewBuffer(body))
}
// parseHtml parses one favorites page, prints the title text and link of
// every ".post" entry and hands the whitespace-trimmed title and the link to
// save. A parse failure panics via checkErr.
func parseHtml(content io.ReadCloser, db *sql.DB) {
	page, err := goquery.Parse(content)
	checkErr(err)
	posts := page.Find(".post")
	for idx := 0; idx < posts.Length(); idx++ {
		post := posts.Eq(idx)
		titleNode := post.Find(".title")
		anchor := titleNode.Find("a")
		cleanTitle := strings.TrimSpace(titleNode.Text())
		href := anchor.Attr("href")
		// The title is printed exactly as found on the page; only the stored copy is trimmed.
		fmt.Println(titleNode.Text())
		fmt.Println(anchor.Attr("href"))
		save(cleanTitle, href, db)
	}
}
// save inserts the title/link pair into the toutiao table and prints the id
// of the new row. Pairs that query reports as already stored are skipped.
// Any database error panics via checkErr.
func save(title, link string, db *sql.DB) {
	// Nothing to do when the record already exists.
	if query(db, title, link) {
		return
	}
	stmt, err := db.Prepare("INSERT toutiao SET title=?,link=?")
	checkErr(err)
	// A prepared statement holds resources until it is closed.
	defer stmt.Close()
	res, err := stmt.Exec(title, link)
	// Check before touching res: it is nil when Exec failed.
	checkErr(err)
	id, err := res.LastInsertId()
	checkErr(err)
	fmt.Println(id)
}
// query reports whether the toutiao table already has a row with the given
// title and link. It returns false when the lookup fails for any reason or
// when the scanned id is 0.
func query(db *sql.DB, title, link string) bool {
	const lookup = "select id from toutiao where title = ? and link = ?"
	var id int
	if err := db.QueryRow(lookup, title, link).Scan(&id); err != nil {
		return false
	}
	return id != 0
}
func checkErr(err error) {
	if err != nil {
		panic(err)
	}
}

php编译选项

2016-10-22

因为有些时候要自己手动编译php,因此在这里做个简单的记录,下列主要是针对php5.3留的选项,最新的php7选项会有小的差异

# Configure a PHP source tree for a build installed under /opt/php5.3 that
# reads its php.ini from /opt/etc/php. Enables FPM, pcntl, the mysqlnd-based
# MySQL drivers, the System V IPC extensions and the other extensions listed
# below; rpath, debug and fileinfo are switched off.
# NOTE(review): --enable-opcache and --with-mysql presumably do not apply to
# every PHP version this list is reused for (5.3 vs. 7) - confirm against
# `./configure --help` of the tree being built.
./configure --prefix=/opt/php5.3 \
                     --with-config-file-path=/opt/etc/php \
                     --enable-fpm \
                     --enable-pcntl \
                     --enable-mysqlnd \
                     --enable-opcache \
                     --enable-sockets \
                     --enable-sysvmsg \
                     --enable-sysvsem \
                     --enable-sysvshm \
                     --enable-shmop \
                     --enable-zip \
                     --enable-soap \
                     --enable-xml \
                     --enable-mbstring \
                     --disable-rpath \
                     --disable-debug \
                     --disable-fileinfo \
                     --with-mysql \
                     --with-mysqli=mysqlnd \
                     --with-pdo-mysql=mysqlnd \
                     --with-pcre-regex \
                     --with-iconv \
                     --with-zlib \
                     --with-mcrypt \
                     --with-gd \
                     --with-openssl \
                     --with-mhash \
                     --with-xmlrpc \
                     --with-curl \
                     --with-imap-ssl

Hello World

2016-10-21

Welcome to Hexo! This is your very first post. Check documentation for more info. If you get any problems when using Hexo, you can find the answer in troubleshooting or you can ask me on GitHub.