本文示例先用 MATLAB 的 urlread 下载网页,再借助正则表达式函数 regexp 进行模式匹配,从网页中抓取股票名称和代码:
代码:
% Download the Eastmoney stock-list page and extract one {code.suffix, name}
% row per listed stock of the selected exchange using a regular expression.
url='http://quote.eastmoney.com/stock_list.html';
% The page text is decoded as GBK; status is nonzero only if the download succeeded.
[str,status]=urlread(url,'Charset','GBK');
% Shanghai stocks
suf='ss';
% Token 1 = stock name (1 to 10 characters), token 2 = numeric stock code.
% [^>]* accepts any (or no) extra attributes between the href value and the
% closing '>' of the anchor tag, so the pattern matches both a plain
% <a target="_blank" href="..."> anchor and one that carries additional
% target/rel attributes after the href.
scmp='<li><a target="_blank" href="http://quote.eastmoney.com/sh\d+\.html"[^>]*>(.{1,10})\((\d+)\)</a></li>';
% Shenzhen stocks (uncomment both lines below to switch exchanges)
%suf='sz';
%scmp='<li><a target="_blank" href="http://quote.eastmoney.com/sz\d+\.html"[^>]*>(.{1,10})\((\d+)\)</a></li>';
if status
    % sdata is a 1-by-N cell array; each element holds {name, code}.
    sdata=regexp(str,scmp,'tokens');
else
    error('download error');
end
ls=length(sdata);
if ls==0
    % Make an empty result visible instead of silently producing an empty table.
    warning('stocklist:noMatch','no stock entries matched the pattern; the page layout may have changed');
end
% Column 1: '<code>.<suf>' ticker string, column 2: stock name.
s=cell(ls,2);
for i=1:ls
    s{i,1}=[sdata{i}{2},'.',suf];
    s{i,2}=sdata{i}{1};
end
% Current folder with a trailing platform-specific separator
% (filesep is '\' on Windows, '/' elsewhere).
root=[pwd, filesep];
filename=[root,'stocklist_',suf,'