Details

Back To All Notes

Edit Note Delete Note

Scrape Forbes Business News With Selenium

Don't hardcode the URL!

// Forbes business-news landing page to scrape.
// TODO: read this from configuration instead of hardcoding it.
var businessNewsUrl = "https://www.forbes.com/business/";

// Use Selenium WebDriver for handling click events
var options = new ChromeOptions();
options.AddArgument("--headless");
options.AddArgument("--incognito");
options.AddArgument("--window-size=1920,1080");
options.AddArgument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36");
using var driver = new ChromeDriver(options);

// Start by navigating to the homepage
await driver.Navigate().GoToUrlAsync(businessNewsUrl);

var articles = new List<ForbesBusinessNews>();

// XPath positional predicates are 1-based, so the first article card is div[1].
var articleCount = 1;
while (true)
{
    // FindElement throws NoSuchElementException when nothing matches (it never
    // returns null), so use FindElements and treat an empty result as the end
    // of the article list.
    var matches = driver.FindElements(By.XPath(
        $"//*[@id=\"row-2\"]/div/div/div/div[2]/div[{articleCount}]"));
    if (matches.Count == 0)
    {
        break;
    }

    var node = matches[0];

    // The title anchor and the author anchor each supply both a text and an
    // href, so look each of them up once and reuse the element.
    var titleLink = node.FindElement(By.XPath("div/div/div[2]/h3/a"));
    var authorLink = node.FindElement(By.XPath("div/div/div[2]/div[2]/a"));

    var article = new ForbesBusinessNews()
    {
        Title = titleLink.Text,
        Summary = node.FindElement(By.XPath("div/div/div[2]/p/span")).Text,
        Link = titleLink.GetAttribute("href"),
        ImageUrl = node.FindElement(By.XPath("div/div/div[1]/a/img"))
            .GetAttribute("src"),
        PublishedTime = node.FindElement(By.XPath("div/div/div[2]/div[1]/span")).Text,
        Author = authorLink.Text,
        AuthorProfile = authorLink.GetAttribute("href"),
        AuthorRole = node.FindElement(By.XPath("div/div/div[2]/div[2]/span[2]")).Text
    };
    articles.Add(article);
    articleCount++;
}

Note that this method causes the Forbes website to block the IP address after a few visits!