<?xml version="1.0"?> <configuration> <configSections> <sectionGroup name="userSettings" type="System.Configuration.UserSettingsGroup, System, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089" > <section name="WebCrawler.Properties.Settings" type="System.Configuration.ClientSettingsSection, System, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089" allowExeDefinition="MachineToLocalUser" requirePermission="false" /> </sectionGroup> </configSections> <startup> <supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.0"/> </startup> <userSettings> <WebCrawler.Properties.Settings> <setting name="LogPath" serializeAs="String"> <value>C:\temp\WebCrawler\ErrorLog.txt</value> </setting> <setting name="XMLPath" serializeAs="String"> <value>C:\temp\WebCrawler\WebCrawlerLog.xml</value> </setting> <setting name="URLPath" serializeAs="String"> <value /> </setting> <setting name="TrimTitle" serializeAs="String"> <value /> </setting> </WebCrawler.Properties.Settings> </userSettings> </configuration>
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Diagnostics;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using System.Windows.Forms;
using System.Xml.Linq;

namespace WebCrawler
{
    /// <summary>
    /// Main window of the crawler: lets the user choose the error-log and XML
    /// output locations, downloads a single page and lists the links that
    /// <c>LinkFinder.Find</c> extracts from it.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>Text removed from cleaned titles/URLs; read by <c>LinkFinder.XMLCleanString</c>.</summary>
        public static string sTrimTitle = "";

        /// <summary>URL text of the current crawl as entered by the user; read by <c>LinkFinder.Find</c>.</summary>
        public static string sSourceURL = "";

        private const string ErrorLogFileName = "ErrorLog.txt";
        private const string XmlLogFileName = "WebCrawlerLog.xml";

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Restores the persisted user settings into the form and makes sure the
        /// folders of the log and XML files exist.
        /// </summary>
        private void Form1_Load(object sender, EventArgs e)
        {
            if (!string.IsNullOrEmpty(Properties.Settings.Default.XMLPath))
            {
                txtXMLPath.Text = Properties.Settings.Default.XMLPath;
                EnsureParentDirectoryExists(txtXMLPath.Text);
            }

            if (!string.IsNullOrEmpty(Properties.Settings.Default.LogPath))
            {
                txtLogPath.Text = Properties.Settings.Default.LogPath;
                EnsureParentDirectoryExists(txtLogPath.Text);
            }

            if (!string.IsNullOrEmpty(Properties.Settings.Default.URLPath))
            {
                txtURL.Text = Properties.Settings.Default.URLPath;
            }

            if (!string.IsNullOrEmpty(Properties.Settings.Default.TrimTitle))
            {
                txtTrimTitle.Text = Properties.Settings.Default.TrimTitle;
                sTrimTitle = Properties.Settings.Default.TrimTitle;
            }
        }

        /// <summary>Lets the user pick the folder for the error log and persists the resulting file path.</summary>
        private void btnLogPathBrowse_Click(object sender, EventArgs e)
        {
            if (folderBrowserDialog1.ShowDialog() != DialogResult.OK)
            {
                return;
            }

            // Path.Combine inserts the directory separator that plain concatenation omitted.
            txtLogPath.Text = Path.Combine(folderBrowserDialog1.SelectedPath, ErrorLogFileName);
            Properties.Settings.Default.LogPath = txtLogPath.Text;
            Properties.Settings.Default.Save();
        }

        /// <summary>Lets the user pick the folder for the XML output and persists the resulting file path.</summary>
        private void btnXMLPathBrowse_Click(object sender, EventArgs e)
        {
            if (folderBrowserDialog1.ShowDialog() != DialogResult.OK)
            {
                return;
            }

            txtXMLPath.Text = Path.Combine(folderBrowserDialog1.SelectedPath, XmlLogFileName);
            Properties.Settings.Default.XMLPath = txtXMLPath.Text;
            Properties.Settings.Default.Save();
        }

        /// <summary>
        /// Saves the current URL and trim-title settings, downloads the page,
        /// and fills the result list with one row per extracted link.
        /// </summary>
        private void btnRunCrawler_Click(object sender, EventArgs e)
        {
            lvResults.Clear();

            Properties.Settings.Default.URLPath = txtURL.Text;
            Properties.Settings.Default.TrimTitle = txtTrimTitle.Text;
            Properties.Settings.Default.Save();

            sSourceURL = Properties.Settings.Default.URLPath.ToString();
            sTrimTitle = txtTrimTitle.Text;

            if (string.IsNullOrWhiteSpace(txtURL.Text))
            {
                MessageBox.Show("Please enter a URL to crawl.", "WebCrawler");
                return;
            }

            string fPath = txtXMLPath.Text;
            string sURL = NormalizeUrl(txtURL.Text);

            // Crawl: download the page markup.
            string htmlString;
            try
            {
                using (WebClient wc = new WebClient())
                {
                    htmlString = wc.DownloadString(sURL);
                }
            }
            catch (WebException ex)
            {
                MessageBox.Show("Could not download " + sURL + ":" + Environment.NewLine + ex.Message, "WebCrawler");
                return;
            }
            catch (NotSupportedException ex)
            {
                MessageBox.Show("Could not download " + sURL + ":" + Environment.NewLine + ex.Message, "WebCrawler");
                return;
            }

            // NOTE(review): this appends a log entry to the XML output path, not the
            // error-log path. Kept as in the original; confirm this is intended.
            using (StreamWriter w = File.AppendText(fPath))
            {
                DirLog.Log(sURL, w);
            }

            // Parse once and reuse the result; every Find call also rewrites the XML file.
            List<LinkItem> links = LinkFinder.Find(htmlString);

            lvResults.View = View.Details;
            lvResults.Columns.Add("URL", -2, HorizontalAlignment.Left);
            lvResults.Columns.Add("Type", -2, HorizontalAlignment.Left);
            lvResults.Columns.Add("Extension", -2, HorizontalAlignment.Left);
            lvResults.Columns.Add("Title", -2, HorizontalAlignment.Left);

            lvResults.BeginUpdate();
            try
            {
                foreach (LinkItem link in links)
                {
                    // Read the fields directly instead of round-tripping through
                    // ToString() and splitting on a separator character.
                    ListViewItem row = new ListViewItem(TrimOrEmpty(link.Href));
                    row.SubItems.Add(TrimOrEmpty(link.Type));
                    row.SubItems.Add(TrimOrEmpty(link.Extension));
                    row.SubItems.Add(TrimOrEmpty(link.Text));
                    lvResults.Items.Add(row);
                }
            }
            finally
            {
                lvResults.EndUpdate();
            }

            lvResults.AutoResizeColumn(0, ColumnHeaderAutoResizeStyle.HeaderSize);
            lvResults.AutoResizeColumn(1, ColumnHeaderAutoResizeStyle.ColumnContent);
            lvResults.AutoResizeColumn(2, ColumnHeaderAutoResizeStyle.ColumnContent);
            lvResults.AutoResizeColumn(3, ColumnHeaderAutoResizeStyle.ColumnContent);

            lblRowCount.Text = "Row Count: " + lvResults.Items.Count.ToString();
        }

        /// <summary>Prefixes <c>http://</c> unless the text already starts with an http or https scheme.</summary>
        private static string NormalizeUrl(string url)
        {
            string trimmed = (url ?? "").Trim();
            bool hasScheme =
                trimmed.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                trimmed.StartsWith("https://", StringComparison.OrdinalIgnoreCase);

            return hasScheme ? trimmed : "http://" + trimmed;
        }

        /// <summary>Returns the trimmed value, or an empty string for null.</summary>
        private static string TrimOrEmpty(string value)
        {
            return (value ?? "").Trim();
        }

        /// <summary>
        /// Creates the folder that contains <paramref name="filePath"/> when the path has
        /// a directory part. A path that cannot be used is reported to the user instead
        /// of preventing the form from loading.
        /// </summary>
        private static void EnsureParentDirectoryExists(string filePath)
        {
            try
            {
                string directory = Path.GetDirectoryName(filePath);
                if (!string.IsNullOrEmpty(directory))
                {
                    Directory.CreateDirectory(directory);
                }
            }
            catch (ArgumentException ex)
            {
                ReportUnusablePath(filePath, ex);
            }
            catch (IOException ex)
            {
                ReportUnusablePath(filePath, ex);
            }
            catch (UnauthorizedAccessException ex)
            {
                ReportUnusablePath(filePath, ex);
            }
            catch (NotSupportedException ex)
            {
                ReportUnusablePath(filePath, ex);
            }
        }

        /// <summary>Tells the user that a saved path could not be prepared.</summary>
        private static void ReportUnusablePath(string filePath, Exception ex)
        {
            MessageBox.Show(
                "The folder for '" + filePath + "' could not be created:" + Environment.NewLine + ex.Message,
                "WebCrawler");
        }
    }
// Link model, link extraction and logging helpers (members of namespace WebCrawler).

    /// <summary>One hyperlink extracted from a downloaded page.</summary>
    public struct LinkItem
    {
        /// <summary>Value of the anchor's <c>href</c> attribute.</summary>
        public string Href;

        /// <summary>Anchor markup with tags, tabs and line breaks removed.</summary>
        public string Text;

        /// <summary>"Image" when the anchor contains an <c>img</c> tag with a <c>src</c>, otherwise "Text".</summary>
        public string Type;

        /// <summary>Extension of <see cref="Href"/> without any query string, or "/" when there is none.</summary>
        public string Extension;

        /// <summary>Returns the four fields joined by the "\n\r\t" separator.</summary>
        public override string ToString()
        {
            return Href + "\n\r\t" + Text + "\n\r\t" + Type + "\n\r\t" + Extension;
        }
    }

    /// <summary>Extracts anchors from HTML text and writes them to the configured XML file.</summary>
    static class LinkFinder
    {
        // Patterns are compiled once instead of being re-parsed for every anchor.
        private static readonly Regex TitleRegex = new Regex(@"<title>(.*?)</title>", RegexOptions.Singleline);
        private static readonly Regex AnchorRegex = new Regex(@"(<a.*?>.*?</a>)", RegexOptions.Singleline);
        private static readonly Regex HrefRegex = new Regex(@"href=""(.*?)""", RegexOptions.Singleline);
        private static readonly Regex ImageRegex = new Regex("<img.+?src=[\"'](.+?)[\"'].+?>", RegexOptions.IgnoreCase);
        private static readonly Regex MarkupRegex = new Regex(@"\t|\n|\r|\s*<.*?>\s*", RegexOptions.Singleline);
        private static readonly Regex WhitespaceRegex = new Regex(@"\s+");

        /// <summary>
        /// Finds every anchor in <paramref name="file"/>, saves the list as XML to the
        /// configured XMLPath setting and returns it.
        /// </summary>
        /// <param name="file">HTML markup of the page; null is treated as empty.</param>
        /// <returns>The links that have an href; never null.</returns>
        public static List<LinkItem> Find(string file)
        {
            file = file ?? "";
            List<LinkItem> linkList = new List<LinkItem>();

            // Shortened page title, used only to give error-log entries some context.
            string pageTitle = XMLCleanString(TitleRegex.Match(file).Value, 25);

            foreach (Match anchor in AnchorRegex.Matches(file))
            {
                string markup = anchor.Groups[1].Value;

                Match href = HrefRegex.Match(markup);
                if (!href.Success)
                {
                    // Anchors without an href never reached the result list before
                    // (the extension lookup threw on null); skip them explicitly.
                    continue;
                }

                try
                {
                    LinkItem item = new LinkItem();
                    item.Href = href.Groups[1].Value;
                    item.Type = ImageRegex.IsMatch(markup) ? "Image" : "Text";
                    item.Text = MarkupRegex.Replace(markup, "").Trim();
                    item.Extension = GetExtension(item.Href);
                    linkList.Add(item);
                }
                catch (Exception ex)
                {
                    // Best effort per link: one malformed href must not abort the crawl,
                    // but the failure is recorded instead of being swallowed.
                    LogError(ex.ToString() + Environment.NewLine + "link: " + markup);
                }
            }

            try
            {
                // The root element is named after the crawled URL. EncodeLocalName leaves
                // valid names untouched and escapes characters XML names do not allow.
                string rootName = System.Xml.XmlConvert.EncodeLocalName(XMLCleanString(Form1.sSourceURL, 250));

                XElement url = new XElement(rootName,
                    from ll in linkList
                    select new XElement("LinkInfo",
                        new XElement("URL", ll.Href),
                        new XElement("Type", ll.Type),
                        new XElement("Extension", ll.Extension),
                        new XElement("Text", ll.Text)));

                url.Save(Properties.Settings.Default.XMLPath);
            }
            catch (Exception ex)
            {
                LogError(ex.ToString() + Environment.NewLine + "pageTitle: " + pageTitle);
            }

            return linkList;
        }

        /// <summary>
        /// Reduces a title or URL to a compact token: strips tags, tabs and line breaks,
        /// removes the http/https scheme and the configured trim text, replaces "/" and
        /// whitespace runs with "_", removes apostrophes and truncates.
        /// </summary>
        /// <param name="source">Text to clean; null yields an empty string.</param>
        /// <param name="nLength">Length limit. Longer results are cut to nLength - 1 characters, as before.</param>
        public static string XMLCleanString(string source, int nLength)
        {
            if (source == null)
            {
                return "";
            }

            source = source.Trim();
            source = MarkupRegex.Replace(source, "").Trim();
            source = source.Replace("http://", "");
            source = source.Replace("https://", "");
            source = source.Replace("/", "_");

            // String.Replace throws ArgumentException for an empty search string,
            // and the trim title is empty by default.
            if (!string.IsNullOrEmpty(Form1.sTrimTitle))
            {
                source = source.Replace(Form1.sTrimTitle, "");
            }

            source = WhitespaceRegex.Replace(source, "_");
            source = source.Replace(@"'", "");

            if (source.Length > nLength)
            {
                // Same result as the former Substring(0, nLength) followed by dropping the
                // last character, without failing when nLength is 0 or negative.
                source = source.Substring(0, Math.Max(nLength - 1, 0));
            }

            return source;
        }

        /// <summary>Returns the extension of <paramref name="href"/> without its query string, or "/" if it has none.</summary>
        private static string GetExtension(string href)
        {
            string extension = VirtualPathUtility.GetExtension(href);
            if (string.IsNullOrEmpty(extension))
            {
                return "/";
            }

            return extension.Split('?')[0];
        }

        /// <summary>Appends a message to the configured error log; never throws.</summary>
        private static void LogError(string message)
        {
            try
            {
                using (StreamWriter sw = File.AppendText(Properties.Settings.Default.LogPath))
                {
                    DirLog.Log(message, sw);
                }
            }
            catch (Exception logEx)
            {
                // Logging is diagnostics only; an unusable log path must not stop the crawl.
                Trace.WriteLine("WebCrawler: could not write error log: " + logEx.Message);
                Trace.WriteLine(message);
            }
        }
    }

    /// <summary>Minimal text log writer/reader.</summary>
    class DirLog
    {
        /// <summary>Writes one log entry (time, date and message) to <paramref name="w"/>.</summary>
        /// <param name="logMessage">Text of the entry.</param>
        /// <param name="w">Destination writer; the caller owns and disposes it.</param>
        public static void Log(string logMessage, TextWriter w)
        {
            if (w == null)
            {
                throw new ArgumentNullException("w");
            }

            DateTime now = DateTime.Now;
            w.Write("\r\nLog Entry : ");
            // Time followed by date; the original printed the time twice.
            w.WriteLine("{0} {1}", now.ToLongTimeString(), now.ToLongDateString());
            w.WriteLine(" :");
            w.WriteLine(" :{0}", logMessage);
            w.WriteLine("--------------------------------------------------------------------------------");
        }

        /// <summary>Copies every line read from <paramref name="r"/> to the console.</summary>
        public static void DumpLog(StreamReader r)
        {
            string line;
            while ((line = r.ReadLine()) != null)
            {
                Console.WriteLine(line);
            }
        }
    }
}
Last Updated on October 26, 2015
You must be logged in to post a comment.