|
dev
newsgroups
|
|||||||||||||||||||||||
|
|||||||||||||||||||||||
Trying to scrape a website in C#, but failing. Looking for assistance pleaseI have now spent far too much time on this small problem, I am hopefully going to hand it over to you guys that are cleverer than I in this respect. Problem. I need to get all firms in a certain postal code area (say "E1") from the FSA website. I can do this interactively, but I thought I would save time and automate it. Big mistake, I have taken hours investigating an so far failed to get the second search results page displayed. Step 1. Goto site http://www.fsa.gov.uk/register/firmSearchForm.do and enter postal code of "E1", press "Submit". Step 2. At bottom of screen, you can select 21 further pages via URI such as http://www.fsa.gov.uk/register/firmMainSearch.do?pageNumber=3 for page 3. I though this would be easy to code in VS2005, but I have come to a complete standstill and would like some help please. Full code in a button Click event below. First connection to the site is only to get the cookie, nothing else. Second connection is the entry of postcode "E1" and pressing "Submit" button Third connection is the selection of Page 2 to display companies 11 to 20. You will need to add following to top of basic windforms app. using System.Net; using System.IO; using System.Net.Cache; ---------------------------------
// Search endpoint used by all three requests (session GET, form POST, paging GET).
private const string FirmSearchUrl = "http://www.fsa.gov.uk/register/firmMainSearch.do";

// Content type of the URL-encoded search form body.
private const string FormContentType = "application/x-www-form-urlencoded";

// Accept / User-Agent values sent with every request (the User-Agent identifies as MSIE 7.0).
private const string BrowserAccept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*";
private const string BrowserUserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727)";

// Click handler: runs the three-step search against the FSA register and shows
// the HTML of result page 2 in webBrowser1.
//
//   1. GET the search URL so the server issues its session cookie.
//   2. POST the search form for postcode "E1".
//   3. GET page 2 of the results.
//
// A WebException from any of the three requests is not caught here and
// propagates to the caller.
private void button2_Click(object sender, EventArgs e)
{
    // One container is shared by all three requests: it stores the cookies the
    // server sets and sends them back automatically, so no "Cookie" header is
    // built by hand.
    CookieContainer cookies = new CookieContainer();

    //
    // First connection to site, purely to get the cookie out.
    // (reported as working in the original post)
    //
    string sessionSuffix = string.Empty;
    HttpWebRequest sessionRequest = CreateBrowserRequest(FirmSearchUrl, "GET", null, cookies);
    using (HttpWebResponse sessionResponse = (HttpWebResponse)sessionRequest.GetResponse())
    {
        // Guard against a response without cookies instead of indexing [0] blindly.
        if (sessionResponse.Cookies.Count > 0)
        {
            Cookie sessionCookie = sessionResponse.Cookies[0];
            sessionSuffix = sessionCookie.Name + "=" + sessionCookie.Value;
        }
    }

    //
    // Second connection to site to emulate sending postcode E1 and pressing Submit button.
    // (reported as working in the original post)
    // The first cookie is also appended to the URL after ';', as in the original code.
    //
    string postData = "pageNumber=0&firmName=&postcodeOut=E1&postcodeIn=&searchType=1&currAuthorisedInd=on&ddd=Submit";
    HttpWebRequest searchRequest = CreateBrowserRequest(FirmSearchUrl + ";" + sessionSuffix, "POST", FirmSearchUrl + ";", cookies);
    searchRequest.CachePolicy = new HttpRequestCachePolicy(HttpRequestCacheLevel.NoCacheNoStore);
    WriteFormBody(searchRequest, postData);
    // The first results page is read and discarded.
    // NOTE(review): presumably the server keeps the search in the session so that
    // the paging request below can refer to it - confirm against the site.
    ReadResponseBody(searchRequest);

    //
    // Third connection, attempting to go to the second screen to read off the subsequent companies.
    // (reported as NOT working in the original post; the cause is not established
    // here, so verify the result after the header/decompression fixes)
    //
    string pageAddress = FirmSearchUrl + "?pageNumber=2";
    HttpWebRequest pageRequest = CreateBrowserRequest(pageAddress, "GET", FirmSearchUrl + ";" + sessionSuffix, cookies);
    webBrowser1.DocumentText = ReadResponseBody(pageRequest);
}

// Creates a request for the given address with the shared browser-like headers.
// referer may be null, in which case no Referer header is set.
private static HttpWebRequest CreateBrowserRequest(string address, string method, string referer, CookieContainer cookies)
{
    // Direct cast: a non-HTTP address fails here with InvalidCastException
    // rather than later with a NullReferenceException.
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(address);
    request.Method = method;
    request.CookieContainer = cookies;
    request.Accept = BrowserAccept;
    request.UserAgent = BrowserUserAgent;
    request.KeepAlive = true;
    // Replaces the hand-written "Accept-Encoding: gzip, deflate" header: the
    // response stream is now decompressed before it reaches the StreamReader.
    request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
    request.Headers.Add("Accept-Language", "en-us");
    request.Headers.Add("UA-CPU", "x86");
    if (referer != null)
    {
        request.Referer = referer;
    }
    return request;
}

// Writes postData as the URL-encoded form body of the request.
private static void WriteFormBody(HttpWebRequest request, string postData)
{
    // The bytes that are counted for Content-Length are the bytes that are sent.
    byte[] body = Encoding.ASCII.GetBytes(postData);
    request.ContentType = FormContentType;
    request.ContentLength = body.Length;
    using (Stream requestStream = request.GetRequestStream())
    {
        requestStream.Write(body, 0, body.Length);
    }
}

// Sends the request and returns the whole response body as text.
// The response and reader are disposed even when reading throws.
private static string ReadResponseBody(HttpWebRequest request)
{
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (StreamReader reader = new StreamReader(response.GetResponseStream()))
    {
        return reader.ReadToEnd();
    }
}
Whats the actuall error you get?
Ciaran Show quote "Admin.Tal***@googlemail.com" wrote: > Hi people, > > I have now spent far too much time on this small problem, I am > hopefully going to hand it over to you guys that are cleverer than I in > this respect. > > Problem. I need to get all firms in a certain postal code area (say > "E1") from the FSA website. I can do this interactively, but I > thought I would save time and automate it. Big mistake, I have taken > hours investigating an so far failed to get the second search results > page displayed. > > Step 1. > Goto site http://www.fsa.gov.uk/register/firmSearchForm.do and enter > postal code of "E1", press "Submit". > > Step 2. > At bottom of screen, you can select 21 further pages via URI such as > http://www.fsa.gov.uk/register/firmMainSearch.do?pageNumber=3 for page > 3. > > I though this would be easy to code in VS2005, but I have come to a > complete standstill and would like some help please. > > Full code in a button Click event below. > > First connection to the site is only to get the cookie, nothing else. > Second connection is the entry of postcode "E1" and pressing "Submit" > button > Third connection is the selection of Page 2 to display companies 11 to > 20. > > You will need to add following to top of basic windforms app. > using System.Net; > using System.IO; > using System.Net.Cache; > --------------------------------- > private void button2_Click(object sender, EventArgs e) > { > string address; > string postData; > CookieContainer cookies = new CookieContainer(); > HttpWebRequest webRequest; > HttpWebResponse webResponse; > StreamReader responseReader; > StreamWriter requestWriter; > string responseData; > Cookie cookie ; > > // > // First connection to site, purely to get the cookie out. 
> // (WORKS) > // > address = "http://www.fsa.gov.uk/register/firmMainSearch.do"; > postData = ""; > > webRequest = WebRequest.Create(address) as HttpWebRequest; > webRequest.Method = "GET"; > webRequest.CookieContainer = cookies; > webRequest.ContentType = "application/x-www-form-urlencoded"; > webResponse = (HttpWebResponse)webRequest.GetResponse(); > responseReader = new StreamReader(webResponse.GetResponseStream()); > responseData = responseReader.ReadToEnd(); > responseReader.Close(); > webResponse.Close(); > > cookie = webResponse.Cookies[0]; > > // > // Second connection to site to emulate sending postcode E1 and > pressing Submit button > // (WORKS) > // > postData = > "pageNumber=0&firmName=&postcodeOut=E1&postcodeIn=&searchType=1&currAuthorisedInd=on&ddd=Submit"; > address = "http://www.fsa.gov.uk/register/firmMainSearch.do;" + > cookie.Name + "="+cookie.Value; > webRequest = WebRequest.Create(address) as HttpWebRequest; > webRequest.Method = "POST"; > webRequest.CookieContainer = cookies; > webRequest.CookieContainer.Add(cookie); > webRequest.Accept = "image/gif, image/x-xbitmap, image/jpeg, > image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, > application/vnd.ms-powerpoint, application/msword, */*"; > byte[] bytes = Encoding.ASCII.GetBytes(postData); > webRequest.ContentLength = bytes.Length; > > webRequest.Referer = > "http://www.fsa.gov.uk/register/firmMainSearch.do;";// +cookie.Name + > "=" + cookie.Value; > webRequest.Headers.Add("Cookie", cookie.Name + "=" + cookie.Value); > webRequest.Headers.Add("Accept-Encoding", "gzip, deflate"); > webRequest.Headers.Add("Accept-Language", "en-us"); > webRequest.Headers.Add("UA-CPU", "x86"); > webRequest.KeepAlive = true; > webRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows > NT 5.1; .NET CLR 2.0.50727)"; > webRequest.ContentType = "application/x-www-form-urlencoded"; > webRequest.CachePolicy = new > HttpRequestCachePolicy(HttpRequestCacheLevel.NoCacheNoStore); > 
requestWriter = new StreamWriter(webRequest.GetRequestStream()); > requestWriter.Write(postData); > requestWriter.Close(); > webResponse = (HttpWebResponse)webRequest.GetResponse(); > responseReader = new StreamReader(webResponse.GetResponseStream()); > responseData = responseReader.ReadToEnd(); > responseReader.Close(); > webResponse.Close(); > > > // > // Third connection, attempting to go to the second screen to read > off the subsequent companies > // (DOES NOT WORK) > // > string Secret_address = > "http://www.fsa.gov.uk/register/firmMainSearch.do?pageNumber=2"; > > webRequest = WebRequest.Create(Secret_address) as HttpWebRequest; > webRequest.Method = "GET"; > webRequest.CookieContainer = cookies; > webRequest.ContentType = "application/x-www-form-urlencoded"; > webRequest.CookieContainer.Add(cookie); > webRequest.Accept = "image/gif, image/x-xbitmap, image/jpeg, > image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, > application/vnd.ms-powerpoint, application/msword, */*"; > > webRequest.Referer = > "http://www.fsa.gov.uk/register/firmMainSearch.do;" + cookie.Name + "=" > + cookie.Value; > webRequest.Headers.Add("Cookie", cookie.Name + "=" + cookie.Value); > webRequest.Headers.Add("Accept-Encoding", "gzip, deflate"); > webRequest.Headers.Add("Accept-Language", "en-us"); > webRequest.Headers.Add("UA-CPU", "x86"); > webRequest.KeepAlive = true; > webRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows > NT 5.1; .NET CLR 2.0.50727)"; > > webResponse = (HttpWebResponse)webRequest.GetResponse(); > responseReader = new StreamReader(webResponse.GetResponseStream()); > responseData = responseReader.ReadToEnd(); > responseReader.Close(); > webResponse.Close(); > > webBrowser1.DocumentText = responseData; > > } > > Thanks all.
Fixed by using WebBrowser control instead of separate WebRequest / WebResponse objects. Ciaran O'Donnell wrote: Show quote > Whats the actuall error you get? > > Ciaran > > "Admin.Tal***@googlemail.com" wrote: > > > Hi people, > > > > I have now spent far too much time on this small problem, I am > > hopefully going to hand it over to you guys that are cleverer than I in > > this respect. > > > > Problem. I need to get all firms in a certain postal code area (say > > "E1") from the FSA website. I can do this interactively, but I > > thought I would save time and automate it. Big mistake, I have taken > > hours investigating an so far failed to get the second search results > > page displayed. > > > > Step 1. > > Goto site http://www.fsa.gov.uk/register/firmSearchForm.do and enter > > postal code of "E1", press "Submit". > > > > Step 2. > > At bottom of screen, you can select 21 further pages via URI such as > > http://www.fsa.gov.uk/register/firmMainSearch.do?pageNumber=3 for page > > 3. > > > > I though this would be easy to code in VS2005, but I have come to a > > complete standstill and would like some help please. > > > > Full code in a button Click event below. > > > > First connection to the site is only to get the cookie, nothing else. > > Second connection is the entry of postcode "E1" and pressing "Submit" > > button > > Third connection is the selection of Page 2 to display companies 11 to > > 20. > > > > You will need to add following to top of basic windforms app. 
> > using System.Net; > > using System.IO; > > using System.Net.Cache; > > --------------------------------- > > private void button2_Click(object sender, EventArgs e) > > { > > string address; > > string postData; > > CookieContainer cookies = new CookieContainer(); > > HttpWebRequest webRequest; > > HttpWebResponse webResponse; > > StreamReader responseReader; > > StreamWriter requestWriter; > > string responseData; > > Cookie cookie ; > > > > // > > // First connection to site, purely to get the cookie out. > > // (WORKS) > > // > > address = "http://www.fsa.gov.uk/register/firmMainSearch.do"; > > postData = ""; > > > > webRequest = WebRequest.Create(address) as HttpWebRequest; > > webRequest.Method = "GET"; > > webRequest.CookieContainer = cookies; > > webRequest.ContentType = "application/x-www-form-urlencoded"; > > webResponse = (HttpWebResponse)webRequest.GetResponse(); > > responseReader = new StreamReader(webResponse.GetResponseStream()); > > responseData = responseReader.ReadToEnd(); > > responseReader.Close(); > > webResponse.Close(); > > > > cookie = webResponse.Cookies[0]; > > > > // > > // Second connection to site to emulate sending postcode E1 and > > pressing Submit button > > // (WORKS) > > // > > postData = > > "pageNumber=0&firmName=&postcodeOut=E1&postcodeIn=&searchType=1&currAuthorisedInd=on&ddd=Submit"; > > address = "http://www.fsa.gov.uk/register/firmMainSearch.do;" + > > cookie.Name + "="+cookie.Value; > > webRequest = WebRequest.Create(address) as HttpWebRequest; > > webRequest.Method = "POST"; > > webRequest.CookieContainer = cookies; > > webRequest.CookieContainer.Add(cookie); > > webRequest.Accept = "image/gif, image/x-xbitmap, image/jpeg, > > image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, > > application/vnd.ms-powerpoint, application/msword, */*"; > > byte[] bytes = Encoding.ASCII.GetBytes(postData); > > webRequest.ContentLength = bytes.Length; > > > > webRequest.Referer = > > 
"http://www.fsa.gov.uk/register/firmMainSearch.do;";// +cookie.Name + > > "=" + cookie.Value; > > webRequest.Headers.Add("Cookie", cookie.Name + "=" + cookie.Value); > > webRequest.Headers.Add("Accept-Encoding", "gzip, deflate"); > > webRequest.Headers.Add("Accept-Language", "en-us"); > > webRequest.Headers.Add("UA-CPU", "x86"); > > webRequest.KeepAlive = true; > > webRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows > > NT 5.1; .NET CLR 2.0.50727)"; > > webRequest.ContentType = "application/x-www-form-urlencoded"; > > webRequest.CachePolicy = new > > HttpRequestCachePolicy(HttpRequestCacheLevel.NoCacheNoStore); > > requestWriter = new StreamWriter(webRequest.GetRequestStream()); > > requestWriter.Write(postData); > > requestWriter.Close(); > > webResponse = (HttpWebResponse)webRequest.GetResponse(); > > responseReader = new StreamReader(webResponse.GetResponseStream()); > > responseData = responseReader.ReadToEnd(); > > responseReader.Close(); > > webResponse.Close(); > > > > > > // > > // Third connection, attempting to go to the second screen to read > > off the subsequent companies > > // (DOES NOT WORK) > > // > > string Secret_address = > > "http://www.fsa.gov.uk/register/firmMainSearch.do?pageNumber=2"; > > > > webRequest = WebRequest.Create(Secret_address) as HttpWebRequest; > > webRequest.Method = "GET"; > > webRequest.CookieContainer = cookies; > > webRequest.ContentType = "application/x-www-form-urlencoded"; > > webRequest.CookieContainer.Add(cookie); > > webRequest.Accept = "image/gif, image/x-xbitmap, image/jpeg, > > image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, > > application/vnd.ms-powerpoint, application/msword, */*"; > > > > webRequest.Referer = > > "http://www.fsa.gov.uk/register/firmMainSearch.do;" + cookie.Name + "=" > > + cookie.Value; > > webRequest.Headers.Add("Cookie", cookie.Name + "=" + cookie.Value); > > webRequest.Headers.Add("Accept-Encoding", "gzip, deflate"); > > 
webRequest.Headers.Add("Accept-Language", "en-us"); > > webRequest.Headers.Add("UA-CPU", "x86"); > > webRequest.KeepAlive = true; > > webRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows > > NT 5.1; .NET CLR 2.0.50727)"; > > > > webResponse = (HttpWebResponse)webRequest.GetResponse(); > > responseReader = new StreamReader(webResponse.GetResponseStream()); > > responseData = responseReader.ReadToEnd(); > > responseReader.Close(); > > webResponse.Close(); > > > > webBrowser1.DocumentText = responseData; > > > > } > > > > |
|||||||||||||||||||||||