PHP搜尋引擎

簡單PHP搜尋引擎源代碼，需要開啟PHP的cURL擴充。功能：對某一網址進行檢索，擷取網站基本資訊，同時提取網站的所有連結。

<?php

/**
 * Minimal single-page crawler.
 *
 * Downloads one URL and extracts its <meta> tags, <title> and the href/text
 * of every anchor. Requires the cURL extension for getContent().
 */
class Engine{

    /** @var string URL of the page to index. */
    private $_url = '';

    /**
     * @var array Collected page information, keyed by 'url', 'meta', 'title', 'links'.
     *            Must start out as an array: writing a string key into '' is an
     *            error on PHP 7.1+.
     */
    private $_sites = array();

    /**
     * @param string $url URL of the page to index.
     */
    public function __construct($url){
        $this->_url = $url;
    }

    /**
     * Run the engine: download the page and fill the collected page information.
     *
     * The body excerpt produced by getDetail() is not stored.
     *
     * @return void
     */
    public function start(){
        $content = $this->getContent($this->_url);

        $this->_sites['url'] = $this->_url;
        $this->_sites['meta'] = $this->getMeta($content);
        $this->_sites['title'] = $this->getTitle($content);
        $this->_sites['links'] = $this->getLinks($content);
    }

    /**
     * Extract the <meta name="..." content="..."> tags of a document.
     *
     * get_meta_tags() only accepts a file name, so the markup is written to a
     * private temporary file that is removed again afterwards (instead of a
     * fixed file in the working directory that concurrent runs would share).
     *
     * @param string $content HTML source.
     * @return array Map of meta name => content; empty when nothing could be read.
     */
    public function getMeta($content){
        $file = tempnam(sys_get_temp_dir(), 'meta');
        if($file === false){
            return array();
        }

        $meta = false;
        if(file_put_contents($file, $content) !== false){
            $meta = get_meta_tags($file);
        }
        unlink($file);

        return is_array($meta) ? $meta : array();
    }

    /**
     * Extract an excerpt of the <body> with script/style blocks removed.
     *
     * @param string $content HTML source.
     * @return string At most the first 400 bytes of the body (byte-based, so a
     *                multi-byte character may be cut); '' when there is no body.
     */
    public function getDetail($content){
        // [^>]* allows attributes on <body>; the "s" flag lets "." span newlines.
        if(!preg_match('/<body\b[^>]*>(.*?)<\/body>/is', $content, $matchs)){
            return '';
        }

        $body = $this->stripHTML($matchs[1]);

        return substr($body, 0, 400);
    }

    /**
     * Extract the text of the first <title> element.
     *
     * @param string $content HTML source.
     * @return string Trimmed title, or '' when the document has none.
     */
    public function getTitle($content){
        // Non-greedy so that a second <title> (e.g. inside inline SVG) is not swallowed.
        if(preg_match('/<title\b[^>]*>(.*?)<\/title>/is', $content, $matchs)){
            return trim($matchs[1]);
        }

        return '';
    }

    /**
     * Extract every anchor that carries an href attribute.
     *
     * @param string $content HTML source.
     * @return array array('href' => list of href values,
     *                     'name' => list of link texts with tags stripped),
     *               both lists in document order and index-aligned.
     */
    public function getLinks($content){
        // Accept single- or double-quoted href values anywhere in the tag. The
        // previous pattern only matched anchors that literally contained
        // target="_blank" rel="external nofollow" right after the href.
        $pat = '/<a\s+(?:[^>]*?\s)?href\s*=\s*(["\'])(.*?)\1[^>]*>(.*?)<\/a>/is';

        $result = array('href' => array(), 'name' => array());

        if(preg_match_all($pat, $content, $matchs)){
            $result['href'] = $matchs[2];
            $names = $matchs[3];
            $result['name'] = $this->stripTags($names);
        }

        return $result;
    }

    /**
     * Fetch a page over a raw socket with a hand-written HTTP GET request.
     *
     * Accepts either a bare host name or a full http/https URL. On failure an
     * error message is echoed and false is returned.
     *
     * @param string $url Host name or URL.
     * @return string|false Raw HTTP response (status line and headers included;
     *                      the body may use chunked transfer encoding), or false.
     */
    public function socketOpen($url){
        $parts = parse_url($url);
        if(!is_array($parts) || !isset($parts['host'])){
            // parse_url() reports a bare host name as a path; retry with a scheme.
            $parts = parse_url('http://' . $url);
        }
        if(!is_array($parts) || !isset($parts['host'])){
            $errno = 0;
            $errstr = 'invalid URL';
            echo "連接配接失敗:$errstr($errno)<br/>";
            return false;
        }

        $secure = isset($parts['scheme']) && strtolower($parts['scheme']) === 'https';
        $host = $parts['host'];
        $defaultPort = $secure ? 443 : 80;
        $port = isset($parts['port']) ? (int)$parts['port'] : $defaultPort;
        $path = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';
        if(isset($parts['query'])){
            $path .= '?' . $parts['query'];
        }

        $errno = 0;
        $errstr = '';
        // fsockopen() expects a host name (optionally with a transport prefix), not a URL.
        $fp = fsockopen(($secure ? 'ssl://' : '') . $host, $port, $errno, $errstr, 30);

        if($fp === false){
            echo "連接配接失敗:$errstr($errno)<br/>";
            return false;
        }

        $hostHeader = ($port === $defaultPort) ? $host : $host . ':' . $port;

        // The request line needs spaces between its parts and the header block
        // must end with an empty line, otherwise the server keeps waiting.
        $out  = "GET $path HTTP/1.1\r\n";
        $out .= "Host: $hostHeader\r\n";
        $out .= "Connection: Close\r\n\r\n";

        fwrite($fp, $out);

        $content = '';
        while(!feof($fp)){
            $line = fgets($fp, 1024);
            if($line === false){
                // Read error or timeout: stop instead of looping forever.
                break;
            }
            $content .= $line;
        }

        fclose($fp);

        return $content;
    }

    /**
     * Download the given URL with cURL.
     *
     * @param string $url URL to fetch.
     * @return string Response body, or '' when the transfer failed.
     */
    public function getContent($url){
        $ch = curl_init($url);
        if($ch === false){
            return '';
        }

        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.0)");
        // Have cURL return the body instead of printing it, which removes the
        // need for output buffering (the old code closed one buffer too many).
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

        $content = curl_exec($ch);

        curl_close($ch);

        return ($content === false) ? '' : $content;
    }

    /**
     * Remove <script> and <style> elements, including their contents.
     *
     * @param string $string HTML fragment.
     * @return string Fragment without script/style blocks; other tags are kept.
     */
    public function stripHTML($string){
        // [^>]* also matches tags without attributes; "s" handles multi-line blocks.
        $pat = array(
            '/<script\b[^>]*>.*?<\/script>/is',
            '/<style\b[^>]*>.*?<\/style>/is'
        );

        return preg_replace($pat, '', $string);
    }

    /**
     * Strip HTML tags from every string in a (possibly nested) array.
     *
     * The array is modified in place and also returned.
     *
     * @param array $arr Array of strings and/or nested arrays.
     * @return array The cleaned array.
     */
    public function stripTags(&$arr){
        foreach($arr as $key => $val){
            if(is_array($val)){
                $this->stripTags($arr[$key]);
            }
            else{
                $arr[$key] = strip_tags($val);
            }
        }

        // Return only after the whole array has been processed.
        return $arr;
    }

    /**
     * Print the collected page information inside a <pre> block.
     *
     * NOTE(review): the values come from a remote page and are printed without
     * HTML escaping; escape them before using this outside of local debugging.
     *
     * @return void
     */
    public function show(){
        echo "<pre>";
        print_r($this->_sites);
        echo "</pre>";
    }

//End Class Engine
}

// Demo run: index a single seed page and dump everything that was collected.
$seedUrl = 'http://www.163.com';

$engine = new Engine($seedUrl);

// Download the page and extract its meta tags, title and links.
$engine->start();

// Print the collected information.
$engine->show();

這只是引擎的主要部分，接下來要做的就是把相關資訊存入資料庫，然後接著對所有擷取的連結再去檢索，然後把相關資訊再存入資料庫，那麼核心部分就是我們擷取了這些資訊之後根據資訊內容來設定網站的關鍵字，然後給它一個排名，供以後搜尋。

本文來自PHP100

PHP搜尋引擎

繼續閱讀

Centos 7 Apache配置虛拟主機

PHP進階學習之session寫入資料庫

php寫一個簡潔的登入頁面

Apache與PHP環境下配置本地虛拟主機

Testlink安裝部署之XAMPP

TestLink 圖表中文亂碼問題

ecshop屬性排序

Ubuntu16.04安裝Apache+MySQL+PHP1. 安裝Apache2. 安裝MySQL3. 安裝PHP4. 安裝phpMyAdmin

版本号隐藏

Apache配置SSLApache配置SSL

配置apache支援PHP（win7）

Cloud Studio初體驗

30天了解30種技術系列---(10)面向Cloud的搜尋引擎 ElasticSearch

NOSQL安全攻擊

php 去掉字元串的最後一個字元及截取原字元串1,2,3,4,5,6,

php——水印