天天看點

PHP搜尋引擎

簡單的PHP搜尋引擎原始碼,需要開啟PHP的cURL擴充功能。功能:對某一網址進行檢索,擷取網站基本資訊,同時提取該頁面中的所有連結。

<?php

/**
 * Minimal single-page crawler.
 *
 * Downloads one URL with cURL and extracts the page's meta tags, <title>
 * text and anchor links. Results are kept in memory and can be dumped with
 * show(). getDetail() and socketOpen() are available as helpers but are not
 * called by start().
 */
class Engine
{
    /** @var string URL of the page to crawl. */
    private $_url = '';

    /**
     * Collected page data, keyed by 'url', 'meta', 'title' and 'links'.
     * Must be an array: writing string keys into a string property is an
     * illegal string offset on PHP 7.1+.
     *
     * @var array
     */
    private $_sites = array();

    /**
     * @param string $url URL of the page to crawl.
     */
    public function __construct($url)
    {
        $this->_url = $url;
    }

    /**
     * Run the crawl: fetch the page and fill the internal result set.
     *
     * @return void
     */
    public function start()
    {
        $content = $this->getContent($this->_url);

        $this->_sites['url'] = $this->_url;
        $this->_sites['meta'] = $this->getMeta($content);
        $this->_sites['title'] = $this->getTitle($content);
        $this->_sites['links'] = $this->getLinks($content);
    }

    /**
     * Extract <meta name="..." content="..."> pairs from an HTML document.
     *
     * get_meta_tags() only accepts a file name or URL, so the markup is
     * written to a private temporary file that is removed afterwards.
     *
     * @param string $content HTML source.
     * @return array Map of meta name => content; empty when nothing is found
     *               or the temporary file cannot be written.
     */
    public function getMeta($content)
    {
        $file = tempnam(sys_get_temp_dir(), 'meta');
        if ($file === false) {
            return array();
        }

        $meta = false;
        if (file_put_contents($file, $content) !== false) {
            $meta = get_meta_tags($file);
        }
        unlink($file);

        return is_array($meta) ? $meta : array();
    }

    /**
     * Return the first 400 bytes of the <body> content with script and
     * style blocks removed.
     *
     * The cut is byte based, so a multi-byte character at the boundary may
     * be truncated.
     *
     * @param string $content HTML source.
     * @return string Body excerpt, or '' when the page has no <body> element.
     */
    public function getDetail($content)
    {
        // [^>]* allows attributes on <body>; the "s" modifier lets "." span newlines.
        if (!preg_match('/<body\b[^>]*>(.*?)<\/body>/is', $content, $matchs)) {
            return '';
        }

        $body = $this->stripHTML($matchs[1]);

        // substr() returns false for an empty string before PHP 8.
        return (string) substr($body, 0, 400);
    }

    /**
     * Extract the text of the first <title> element.
     *
     * @param string $content HTML source.
     * @return string Trimmed title text, or '' when the page has no title.
     */
    public function getTitle($content)
    {
        if (!preg_match('/<title\b[^>]*>(.*?)<\/title>/is', $content, $matchs)) {
            return '';
        }

        return trim($matchs[1]);
    }

    /**
     * Collect every anchor that carries a quoted href attribute.
     *
     * @param string $content HTML source.
     * @return array array('href' => list of href values,
     *                     'name' => list of link texts with tags stripped),
     *               both lists in document order and index-aligned.
     */
    public function getLinks($content)
    {
        // Group 1: quote character, group 2: href value, group 3: inner HTML.
        // No other attribute is required, and [^>] keeps the match inside
        // the opening tag.
        $pat = '/<a\b[^>]*?\bhref\s*=\s*(["\'])(.*?)\1[^>]*>(.*?)<\/a>/is';

        preg_match_all($pat, $content, $matchs);

        $result = array();
        $result['href'] = $matchs[2];
        $result['name'] = $this->stripTags($matchs[3]);

        return $result;
    }

    /**
     * Fetch a page over a plain socket on port 80 (no TLS).
     *
     * @param string $url Full URL or bare host name.
     * @return string|false Raw HTTP response (headers included), or false
     *                      when the connection cannot be opened.
     */
    public function socketOpen($url)
    {
        // fsockopen() and the Host header need a host name, not a full URL.
        $parts = parse_url($url);
        if (is_array($parts) && isset($parts['host'])) {
            $host = $parts['host'];
            $path = isset($parts['path']) ? $parts['path'] : '/';
            if (isset($parts['query'])) {
                $path .= '?' . $parts['query'];
            }
        } else {
            $host = $url;
            $path = '/';
        }

        $fp = fsockopen($host, 80, $errno, $errstr, 30);
        if ($fp === false) {
            echo "連接配接失敗:$errstr($errno)<br/>";

            return false;
        }

        // The request line needs spaces around the path, and the header
        // block must end with an empty line.
        $out = "GET $path HTTP/1.1\r\n";
        $out .= "Host: $host\r\n";
        $out .= "Connection: Close\r\n\r\n";
        fwrite($fp, $out);

        $content = stream_get_contents($fp);
        fclose($fp);

        return $content === false ? '' : $content;
    }

    /**
     * Download the given URL with cURL.
     *
     * @param string $url URL to fetch.
     * @return string Response body, or '' when the transfer fails.
     */
    public function getContent($url)
    {
        $ch = curl_init($url);
        if ($ch === false) {
            return '';
        }

        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.0)");
        // Have curl_exec() return the body instead of printing it, so no
        // output buffer has to be opened and closed around the call.
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

        $content = curl_exec($ch);

        // curl_close() has no effect from PHP 8.0 on; only call it on older versions.
        if (PHP_VERSION_ID < 80000) {
            curl_close($ch);
        }

        return is_string($content) ? $content : '';
    }

    /**
     * Remove <script> and <style> elements, including their content.
     *
     * @param string $string HTML fragment.
     * @return string Fragment without script/style blocks; the input is
     *                returned unchanged if the regex engine reports an error.
     */
    public function stripHTML($string)
    {
        $pat = array(
            '/<script\b[^>]*>.*?<\/script>/is',
            '/<style\b[^>]*>.*?<\/style>/is',
        );

        $clean = preg_replace($pat, '', $string);

        return $clean === null ? $string : $clean;
    }

    /**
     * Strip HTML tags from every string in a (possibly nested) array.
     *
     * The array is modified in place and also returned.
     *
     * @param array $arr Array of strings or nested arrays.
     * @return array The cleaned array.
     */
    public function stripTags(&$arr)
    {
        foreach ($arr as $key => $val) {
            if (is_array($val)) {
                $this->stripTags($arr[$key]);
            } else {
                $arr[$key] = strip_tags($val);
            }
        }

        return $arr;
    }

    /**
     * Print the collected data inside a <pre> block.
     *
     * The dump holds text taken from a remote page, so it is HTML-escaped
     * before output. Byte sequences that are not valid UTF-8 are replaced
     * by ENT_SUBSTITUTE rather than blanking the whole dump.
     *
     * @return void
     */
    public function show()
    {
        echo "<pre>";
        echo htmlspecialchars(print_r($this->_sites, true), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        echo "</pre>";
    }

    //End Class Engine
}

// Demo run: crawl http://www.163.com and dump whatever was collected.
$crawler = new Engine('http://www.163.com');

$crawler->start(); // fetch the page and extract meta tags, title and links
$crawler->show();  // print the collected data

?>

這只是引擎的主要部分。接下來要做的,就是把相關資訊存入資料庫,然後對所有擷取到的連結再去檢索,並把相關資訊同樣存入資料庫。核心部分則是:在擷取這些資訊之後,根據資訊內容來設定網站的關鍵字,再給它一個排名,供以後搜尋使用。

本文來自PHP100