天天看点

将HTML源码解析成IHTMLDocument对象,然后使用DOMNode将HTML显示成一棵树

功能:

    将HTML源码解析成IHTMLDocument2对象,然后将IHTMLDocument2转换成IHTMLDocument3,使用DOMNode将HTML显示成一棵树。此解析不执行任何脚本,不从网上下载任何资料,是一个纯文本的解析。

    (方法 Parse(string s) 是一个轻量级的 Parsing 实现。这个代码不会从网上下载任何资料,也不会执行任何脚本,纯属 Parsing。

Parsing是通过MSHTML的Markup Service实现的。要正确使用这个代码,需要添加MSHTML引用。)

      要正确编译如下代码,还需要修改unsafe(启用不安全模式)编译器选项,将其开启。

方法:在“项目”->“<应用程序名称>属性”对话框中打开“配置属性”,选中“生成”项,修改“允许不安全代码块”的内容为true.

[C#]

using System;

using System.Drawing;

using System.Collections;

using System.ComponentModel;

using System.Windows.Forms;

using System.Data;

using mshtml;

using System.Runtime.InteropServices;

using System.IO;

namespace WindowsApplication1

{

 [ComVisible(true), ComImport(), Guid("7FD52380-4E07-101B-AE2D-08002B2EC713") , InterfaceTypeAttribute(ComInterfaceType.InterfaceIsIUnknown)]

 public interface IPersistStreamInit 

 {

  void GetClassID([In, Out] ref Guid pClassID);

  [return: MarshalAs(UnmanagedType.I4)] [PreserveSig]

  int IsDirty();

  void Load([In, MarshalAs(UnmanagedType.Interface)] UCOMIStream pstm);

  void Save([In, MarshalAs(UnmanagedType.Interface)] UCOMIStream pstm, 

   [In, MarshalAs(UnmanagedType.I4)] int fClearDirty);

  void GetSizeMax([Out, MarshalAs(UnmanagedType.LPArray)] long pcbSize);

  void InitNew();

 } 

  /// <summary>

 /// Form1 的摘要说明。

 /// </summary>

 public class Form1 : System.Windows.Forms.Form

 {

  private System.Windows.Forms.Button button1;

  private System.Windows.Forms.TreeView treeView1;

  /// <summary>

  /// 必需的设计器变量。

  /// </summary>

  private System.ComponentModel.Container components = null;

  public Form1()

  {

   //

   // Windows 窗体设计器支持所必需的

   //

   InitializeComponent();

   //

   // TODO: 在 InitializeComponent 调用后添加任何构造函数代码

   //

  }

  /// <summary>

  /// 清理所有正在使用的资源。

  /// </summary>

  protected override void Dispose( bool disposing )

  {

   if( disposing )

   {

    if (components != null)

    {

     components.Dispose();

    }

   }

   base.Dispose( disposing );

  }

  #region Windows 窗体设计器生成的代码

  /// <summary>

  /// Required method for designer support - do not modify

  /// the contents of this method with the code editor.

  /// </summary>

  private void InitializeComponent()

  {

   this.button1 = new System.Windows.Forms.Button();

   this.treeView1 = new System.Windows.Forms.TreeView();

   this.SuspendLayout();

   //

   // button1

   //

   this.button1.Location = new System.Drawing.Point(24, 16);

   this.button1.Name = "button1";

   this.button1.Size = new System.Drawing.Size(88, 24);

   this.button1.TabIndex = 0;

   this.button1.Text = "button1";

   this.button1.Click += new System.EventHandler(this.button1_Click);

   //

   // treeView1

   //

   this.treeView1.ImageIndex = -1;

   this.treeView1.Location = new System.Drawing.Point(280, 96);

   this.treeView1.Name = "treeView1";

   this.treeView1.SelectedImageIndex = -1;

   this.treeView1.Size = new System.Drawing.Size(288, 224);

   this.treeView1.TabIndex = 1;

   //

   // Form1

   //

   this.AutoScaleBaseSize = new System.Drawing.Size(6, 14);

   this.ClientSize = new System.Drawing.Size(664, 333);

   this.Controls.Add(this.treeView1);

   this.Controls.Add(this.button1);

   this.Name = "Form1";

   this.Text = "Form1";

   this.ResumeLayout(false);

  }

  #endregion

  /// <summary>

  /// 应用程序的主入口点。

  /// </summary>

  [STAThread]

  static void Main()

  {

   Application.Run(new Form1());

  }

  unsafe IHTMLDocument2  Parse(string s)

  {

   IHTMLDocument2 pDocument=new HTMLDocumentClass();  

   if(pDocument!=null)

   {

    IPersistStreamInit pPersist=pDocument as IPersistStreamInit ;

    pPersist.InitNew();

    pPersist=null;

    IMarkupServices ms=pDocument as IMarkupServices ;

    if(ms!=null)

    {

     IMarkupContainer pMC=null;

     IMarkupPointer pStart,pEnd;

     ms.CreateMarkupPointer(out pStart);

     ms.CreateMarkupPointer(out pEnd);

     System.Text.StringBuilder sb=new System.Text.StringBuilder(s); 

     IntPtr pSource=Marshal.StringToHGlobalUni(s);

     ms.ParseString(ref *(ushort*)pSource.ToPointer(),0,out pMC,pStart,pEnd);

     if(pMC!=null)

     {

      Marshal.Release(pSource);

      return pMC as IHTMLDocument2;

     }

     Marshal.Release(pSource);

    }

   }

   return null;

  }

  private void button1_Click(object sender, System.EventArgs e)

  {

   string html="";

   string filename="D://NetC#Program//html//163.htm";

   if (!File.Exists(filename))

   {

    Console.WriteLine("文件不存在");

    return;

   }

   StreamReader sr1 = new StreamReader(

    (System.IO.Stream)File.OpenRead(filename),System.Text.Encoding.Default);

   html="";

   while (sr1.Peek()>-1)

   {

    html=html+sr1.ReadToEnd();

   }

   sr1.Close();

   IHTMLDocument2 doc2 = Parse(html);

   Console.WriteLine(doc2.styleSheets.length);

   IHTMLDocument3 HTMLDocument=(IHTMLDocument3)doc2;

   IHTMLDOMNode rootDomNode=(IHTMLDOMNode)HTMLDocument.documentElement;

   TreeNode root=treeView1.Nodes.Add("HTML");

   InsertDOMNodes(rootDomNode,root);

  }

  private void InsertDOMNodes(IHTMLDOMNode parentnode,TreeNode tree_node)

  {

   if(parentnode.hasChildNodes())//是否有子结点

   {

    IHTMLDOMChildrenCollection allchild = (IHTMLDOMChildrenCollection)parentnode.childNodes;

    int length = allchild.length;

    for(int i=0;i<length;i++)//对每个子结点进行处理,首先取出每个子节点的属性,然后进行递归

    {

     IHTMLDOMNode child_node = (IHTMLDOMNode)allchild.item(i);

     string m_snodeName  =child_node.nodeName;

     object m_onodevalue =child_node.nodeValue;

     string m_snodetype  =child_node.nodeType.ToString();

     string m_snodevalue ="";

     if ( m_onodevalue!=null)

      m_snodevalue =m_onodevalue.ToString().Trim();

     TreeNode tempnode=null;

     if (child_node.nodeName.Equals("#text"))

     {

      if ((m_snodevalue!=null)&& (!m_snodevalue.Equals("")))

      {

       tempnode = tree_node.Nodes.Add(m_snodevalue);

      }

     }

     else

     {

      tempnode = tree_node.Nodes.Add(child_node.nodeName);

      InsertDOMNodes(child_node,tempnode);

     }

    }

   }

  }

 }

}