<?xml version="1.0"?>
<!--
  Application configuration for the WebCrawler settings class
  (WebCrawler.Properties.Settings), stored as per-user settings.

  NOTE(review): the LogPath and XMLPath defaults were found without any
  directory separators ("C:tempWebCrawlerErrorLog.txt"). They are rebuilt
  here as files inside C:\temp\WebCrawler; confirm the directory boundaries
  against the project's Settings.settings file.
-->
<configuration>
  <configSections>
    <sectionGroup name="userSettings" type="System.Configuration.UserSettingsGroup, System, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089">
      <section name="WebCrawler.Properties.Settings" type="System.Configuration.ClientSettingsSection, System, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089" allowExeDefinition="MachineToLocalUser" requirePermission="false" />
    </sectionGroup>
  </configSections>
  <startup>
    <supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.0" />
  </startup>
  <userSettings>
    <WebCrawler.Properties.Settings>
      <!-- Text file that receives error log entries. -->
      <setting name="LogPath" serializeAs="String">
        <value>C:\temp\WebCrawler\ErrorLog.txt</value>
      </setting>
      <!-- XML file that receives the links found on the crawled page. -->
      <setting name="XMLPath" serializeAs="String">
        <value>C:\temp\WebCrawler\WebCrawlerLog.xml</value>
      </setting>
      <!-- Last URL entered by the user; empty until the first run. -->
      <setting name="URLPath" serializeAs="String">
        <value />
      </setting>
      <!-- Optional text that is removed from cleaned names; empty by default. -->
      <setting name="TrimTitle" serializeAs="String">
        <value />
      </setting>
    </WebCrawler.Properties.Settings>
  </userSettings>
</configuration>
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Diagnostics;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using System.Windows.Forms;
using System.Xml.Linq;

namespace WebCrawler
{
    /// <summary>
    /// Main window: lets the user pick the log/XML output files, enter a URL,
    /// download that page and list every link found on it.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>Text that <see cref="LinkFinder.XMLCleanString"/> removes from cleaned names.</summary>
        public static string sTrimTitle = "";

        /// <summary>URL entered by the user for the current run; used to name the XML root element.</summary>
        public static string sSourceURL = "";

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Restores the saved settings into the form and makes sure the
        /// directories of the configured output files exist.
        /// </summary>
        private void Form1_Load(object sender, EventArgs e)
        {
            if (!string.IsNullOrEmpty(Properties.Settings.Default.XMLPath))
            {
                txtXMLPath.Text = Properties.Settings.Default.XMLPath;
                EnsureParentDirectory(txtXMLPath.Text);
            }

            if (!string.IsNullOrEmpty(Properties.Settings.Default.LogPath))
            {
                txtLogPath.Text = Properties.Settings.Default.LogPath;
                EnsureParentDirectory(txtLogPath.Text);
            }

            if (!string.IsNullOrEmpty(Properties.Settings.Default.URLPath))
            {
                txtURL.Text = Properties.Settings.Default.URLPath;
            }

            if (!string.IsNullOrEmpty(Properties.Settings.Default.TrimTitle))
            {
                txtTrimTitle.Text = Properties.Settings.Default.TrimTitle;
                sTrimTitle = Properties.Settings.Default.TrimTitle;
            }
        }

        /// <summary>Lets the user choose the folder that holds ErrorLog.txt and saves the choice.</summary>
        private void btnLogPathBrowse_Click(object sender, EventArgs e)
        {
            if (folderBrowserDialog1.ShowDialog() == DialogResult.OK)
            {
                // Path.Combine supplies the directory separator between folder and file name.
                txtLogPath.Text = Path.Combine(folderBrowserDialog1.SelectedPath, "ErrorLog.txt");
                Properties.Settings.Default.LogPath = txtLogPath.Text;
                Properties.Settings.Default.Save();
            }
        }

        /// <summary>Lets the user choose the folder that holds WebCrawlerLog.xml and saves the choice.</summary>
        private void btnXMLPathBrowse_Click(object sender, EventArgs e)
        {
            if (folderBrowserDialog1.ShowDialog() == DialogResult.OK)
            {
                txtXMLPath.Text = Path.Combine(folderBrowserDialog1.SelectedPath, "WebCrawlerLog.xml");
                Properties.Settings.Default.XMLPath = txtXMLPath.Text;
                Properties.Settings.Default.Save();
            }
        }

        /// <summary>
        /// Saves the URL and trim text, downloads the page, extracts its links
        /// (which also writes the XML file) and shows them in the list view.
        /// </summary>
        private void btnRunCrawler_Click(object sender, EventArgs e)
        {
            lvResults.Clear();

            Properties.Settings.Default.URLPath = txtURL.Text;
            Properties.Settings.Default.TrimTitle = txtTrimTitle.Text;
            Properties.Settings.Default.Save();
            sSourceURL = Properties.Settings.Default.URLPath;
            sTrimTitle = txtTrimTitle.Text;

            if (string.IsNullOrWhiteSpace(txtURL.Text))
            {
                MessageBox.Show(this, "Please enter a URL to crawl.", "WebCrawler",
                    MessageBoxButtons.OK, MessageBoxIcon.Information);
                return;
            }

            string url = NormalizeUrl(txtURL.Text);
            string html;
            try
            {
                using (WebClient client = new WebClient())
                {
                    html = client.DownloadString(url);
                }
            }
            catch (WebException ex)
            {
                MessageBox.Show(this, "Unable to download " + url + ": " + ex.Message, "WebCrawler",
                    MessageBoxButtons.OK, MessageBoxIcon.Error);
                return;
            }

            // Record the crawled URL in the text log. The entry is plain text, so it
            // goes to the log file rather than to the XML output file, which
            // LinkFinder.Find overwrites with the extracted links.
            string logPath = txtLogPath.Text;
            if (!string.IsNullOrEmpty(logPath))
            {
                using (StreamWriter writer = File.AppendText(logPath))
                {
                    DirLog.Log(url, writer);
                }
            }

            // Parse once: Find also saves the XML file, so a second call would
            // repeat both the parsing and the file write.
            List<LinkItem> links = LinkFinder.Find(html);

            lvResults.View = View.Details;
            lvResults.Columns.Add("URL", -2, HorizontalAlignment.Left);
            lvResults.Columns.Add("Type", -2, HorizontalAlignment.Left);
            lvResults.Columns.Add("Extension", -2, HorizontalAlignment.Left);
            lvResults.Columns.Add("Title", -2, HorizontalAlignment.Left);

            lvResults.BeginUpdate();
            try
            {
                foreach (LinkItem link in links)
                {
                    // Sub-items follow the column order: Type, Extension, Title.
                    ListViewItem row = new ListViewItem(TrimOrEmpty(link.Href));
                    row.SubItems.Add(TrimOrEmpty(link.Type));
                    row.SubItems.Add(TrimOrEmpty(link.Extension));
                    row.SubItems.Add(TrimOrEmpty(link.Text));
                    lvResults.Items.Add(row);
                }
            }
            finally
            {
                lvResults.EndUpdate();
            }

            lvResults.AutoResizeColumn(0, ColumnHeaderAutoResizeStyle.HeaderSize);
            lvResults.AutoResizeColumn(1, ColumnHeaderAutoResizeStyle.ColumnContent);
            lvResults.AutoResizeColumn(2, ColumnHeaderAutoResizeStyle.ColumnContent);
            lvResults.AutoResizeColumn(3, ColumnHeaderAutoResizeStyle.ColumnContent);

            lblRowCount.Text = "Row Count: " + lvResults.Items.Count.ToString();
        }

        /// <summary>Creates the directory that contains <paramref name="filePath"/> when the path names one.</summary>
        private static void EnsureParentDirectory(string filePath)
        {
            string directory = Path.GetDirectoryName(filePath);

            // A bare file name has no directory part; CreateDirectory rejects an empty path.
            if (!string.IsNullOrEmpty(directory))
            {
                Directory.CreateDirectory(directory);
            }
        }

        /// <summary>Prefixes "http://" unless the address already starts with an http or https scheme.</summary>
        private static string NormalizeUrl(string address)
        {
            string trimmed = address.Trim();
            bool hasScheme = trimmed.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                || trimmed.StartsWith("https://", StringComparison.OrdinalIgnoreCase);

            return hasScheme ? trimmed : "http://" + trimmed;
        }

        /// <summary>Returns the trimmed value, or an empty string for null.</summary>
        private static string TrimOrEmpty(string value)
        {
            return value == null ? string.Empty : value.Trim();
        }
    }

    /// <summary>One hyperlink extracted from a page.</summary>
    public struct LinkItem
    {
        /// <summary>Value of the anchor's href attribute.</summary>
        public string Href;

        /// <summary>Anchor content with tags, tabs and line breaks removed.</summary>
        public string Text;

        /// <summary>"Image" when the anchor wraps an img tag, otherwise "Text".</summary>
        public string Type;

        /// <summary>Extension of the href without any query string, or "/" when there is none.</summary>
        public string Extension;

        /// <summary>Returns the four fields separated by the sequence newline, carriage return, tab.</summary>
        public override string ToString()
        {
            return Href + "\n\r\t" + Text + "\n\r\t" + Type + "\n\r\t" + Extension;
        }
    }

    /// <summary>Extracts anchors from HTML and saves them to the configured XML file.</summary>
    static class LinkFinder
    {
        private static readonly Regex TitleRegex =
            new Regex(@"<title>(.*?)</title>", RegexOptions.Singleline);

        // \b keeps tags that merely start with "a" (abbr, article, ...) from matching.
        private static readonly Regex AnchorRegex =
            new Regex(@"(<a\b.*?>.*?</a>)", RegexOptions.Singleline);

        private static readonly Regex HrefRegex =
            new Regex(@"href=""(.*?)""", RegexOptions.Singleline);

        private static readonly Regex ImageRegex =
            new Regex("<img.+?src=[\"'](.+?)[\"'].+?>", RegexOptions.IgnoreCase);

        // Tabs, line breaks and any tag together with the whitespace around it.
        private static readonly Regex MarkupRegex =
            new Regex(@"\t|\n|\r|\s*<.*?>\s*", RegexOptions.Singleline);

        private static readonly Regex WhitespaceRegex = new Regex(@"\s+");

        /// <summary>
        /// Finds every anchor with a non-empty double-quoted href in <paramref name="file"/>,
        /// saves the result to the XML file named by the XMLPath setting and returns it.
        /// </summary>
        /// <param name="file">HTML source of the page.</param>
        /// <returns>The links found; empty when the page has none.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="file"/> is null.</exception>
        public static List<LinkItem> Find(string file)
        {
            if (file == null)
            {
                throw new ArgumentNullException("file");
            }

            List<LinkItem> linkList = new List<LinkItem>();

            // Shortened page title; only used to give error log entries some context.
            string pageTitle = XMLCleanString(TitleRegex.Match(file).Value, 25);

            foreach (Match anchor in AnchorRegex.Matches(file))
            {
                string markup = anchor.Groups[1].Value;
                Match href = HrefRegex.Match(markup);

                // Anchors without a usable href are not links and are left out.
                if (!href.Success || href.Groups[1].Value.Length == 0)
                {
                    continue;
                }

                try
                {
                    LinkItem item = new LinkItem();
                    item.Href = href.Groups[1].Value;
                    item.Type = ImageRegex.IsMatch(markup) ? "Image" : "Text";
                    item.Text = MarkupRegex.Replace(markup, "").Trim();
                    item.Extension = GetExtension(item.Href);
                    linkList.Add(item);
                }
                catch (Exception ex)
                {
                    // VirtualPathUtility rejects some hrefs; skip that link but leave
                    // a trace instead of discarding the failure silently.
                    LogError(ex.ToString() + Environment.NewLine + "href: " + href.Groups[1].Value);
                }
            }

            try
            {
                // The root element is named after the source URL. EncodeLocalName
                // escapes characters that are not allowed in an XML name (':', '?',
                // '=', a leading digit, ...) so such URLs can still be saved.
                string rootName = System.Xml.XmlConvert.EncodeLocalName(
                    XMLCleanString(Form1.sSourceURL, 250));

                XElement root = new XElement(rootName,
                    from link in linkList
                    select new XElement("LinkInfo",
                        new XElement("URL", link.Href),
                        new XElement("Type", link.Type),
                        new XElement("Extension", link.Extension),
                        new XElement("Text", link.Text)));

                root.Save(Properties.Settings.Default.XMLPath);
            }
            catch (Exception ex)
            {
                // Saving is best effort: the caller still receives the links.
                LogError(ex.ToString() + Environment.NewLine + "pageTitle: " + pageTitle);
            }

            return linkList;
        }

        /// <summary>
        /// Turns a title or URL into a compact name: strips tags, line breaks, the
        /// http/https scheme, apostrophes and the text in <see cref="Form1.sTrimTitle"/>;
        /// replaces "/" and whitespace runs with "_"; shortens over-long results.
        /// </summary>
        /// <param name="source">Text to clean; null yields an empty string.</param>
        /// <param name="nLength">
        /// Length limit. A result longer than this is cut to <c>nLength - 1</c> characters.
        /// </param>
        /// <returns>The cleaned, possibly shortened text.</returns>
        public static string XMLCleanString(string source, int nLength)
        {
            if (string.IsNullOrEmpty(source))
            {
                return string.Empty;
            }

            string cleaned = MarkupRegex.Replace(source.Trim(), "").Trim();
            cleaned = cleaned.Replace("http://", "");
            cleaned = cleaned.Replace("https://", "");
            cleaned = cleaned.Replace("/", "_");

            // string.Replace throws ArgumentException for an empty search string,
            // and the trim text is empty unless the user has entered one.
            if (!string.IsNullOrEmpty(Form1.sTrimTitle))
            {
                cleaned = cleaned.Replace(Form1.sTrimTitle, "");
            }

            cleaned = WhitespaceRegex.Replace(cleaned, "_");
            cleaned = cleaned.Replace("'", "");

            if (cleaned.Length > nLength)
            {
                // Keeps nLength - 1 characters, clamped so a limit of zero or less
                // yields an empty string instead of throwing.
                // NOTE(review): one character short of the limit; confirm this is intended.
                cleaned = cleaned.Substring(0, Math.Max(0, nLength - 1));
            }

            return cleaned;
        }

        /// <summary>Returns the href's extension without a query string, or "/" when it has none.</summary>
        private static string GetExtension(string href)
        {
            string extension = VirtualPathUtility.GetExtension(href);
            if (string.IsNullOrEmpty(extension))
            {
                return "/";
            }

            int queryStart = extension.IndexOf('?');
            return queryStart >= 0 ? extension.Substring(0, queryStart) : extension;
        }

        /// <summary>
        /// Appends an entry to the file named by the LogPath setting. Falls back to
        /// debug output when no path is configured or the file cannot be written.
        /// </summary>
        private static void LogError(string message)
        {
            string logPath = Properties.Settings.Default.LogPath;
            if (string.IsNullOrEmpty(logPath))
            {
                Debug.WriteLine(message);
                return;
            }

            try
            {
                using (StreamWriter writer = File.AppendText(logPath))
                {
                    DirLog.Log(message, writer);
                }
            }
            catch (IOException ex)
            {
                Debug.WriteLine(message + Environment.NewLine + ex.Message);
            }
            catch (UnauthorizedAccessException ex)
            {
                Debug.WriteLine(message + Environment.NewLine + ex.Message);
            }
        }
    }

    /// <summary>Minimal text log helper.</summary>
    class DirLog
    {
        /// <summary>Writes one timestamped entry followed by a separator line.</summary>
        /// <param name="logMessage">Text of the entry.</param>
        /// <param name="w">Writer that receives the entry; the caller owns and disposes it.</param>
        /// <exception cref="ArgumentNullException"><paramref name="w"/> is null.</exception>
        public static void Log(string logMessage, TextWriter w)
        {
            if (w == null)
            {
                throw new ArgumentNullException("w");
            }

            // Read the clock once so the time and date parts belong to the same instant.
            DateTime now = DateTime.Now;

            w.Write("\r\nLog Entry : ");
            w.WriteLine("{0} {1}", now.ToLongTimeString(), now.ToLongDateString());
            w.WriteLine(" :");
            w.WriteLine(" :{0}", logMessage);
            w.WriteLine("--------------------------------------------------------------------------------");
        }

        /// <summary>Copies every remaining line of <paramref name="r"/> to the console.</summary>
        public static void DumpLog(StreamReader r)
        {
            string line;
            while ((line = r.ReadLine()) != null)
            {
                Console.WriteLine(line);
            }
        }
    }
}
Last Updated on October 26, 2015
You must be logged in to post a comment.