Details
Scrape Forbes Business News With Selenium
Don't hardcode the URL!
// Scrapes the article cards on the Forbes business page into a list of
// ForbesBusinessNews objects using a headless Chrome session.
// NOTE(review): the URL is inlined here for brevity; prefer reading it from
// configuration in real code.
var businessNewsUrl = "https://www.forbes.com/business/";

// Use Selenium Web Driver for handling click events
var options = new ChromeOptions();
options.AddArgument("--headless");
options.AddArgument("--incognito");
options.AddArgument("--window-size=1920,1080");
options.AddArgument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36");

// The using declaration disposes the driver (and its browser) at end of scope.
using var driver = new ChromeDriver(options);

// Start by navigating to the homepage
await driver.Navigate().GoToUrlAsync(businessNewsUrl);

var articles = new List<ForbesBusinessNews>();

// XPath positions are 1-based, so the first article card is div[1].
var articleCount = 1;

while (true)
{
    // FindElement throws NoSuchElementException when nothing matches and never
    // returns null, so probe with FindElements and stop on an empty result.
    var matches = driver.FindElements(By.XPath(
        $"//*[@id=\"row-2\"]/div/div/div/div[2]/div[{articleCount}]"));
    if (matches.Count == 0)
    {
        break;
    }

    var node = matches[0];

    // The headline anchor supplies both the title text and the article link;
    // the author anchor supplies both the author name and the profile link.
    var titleLink = node.FindElement(By.XPath("div/div/div[2]/h3/a"));
    var authorLink = node.FindElement(By.XPath("div/div/div[2]/div[2]/a"));

    var article = new ForbesBusinessNews()
    {
        Title = titleLink.Text,
        Summary = node.FindElement(By.XPath("div/div/div[2]/p/span")).Text,
        Link = titleLink.GetAttribute("href"),
        ImageUrl = node.FindElement(By.XPath("div/div/div[1]/a/img"))
            .GetAttribute("src"),
        PublishedTime = node.FindElement(By.XPath("div/div/div[2]/div[1]/span")).Text,
        Author = authorLink.Text,
        AuthorProfile = authorLink.GetAttribute("href"),
        AuthorRole = node.FindElement(By.XPath("div/div/div[2]/div[2]/span[2]")).Text
    };

    articles.Add(article);
    articleCount++;
}
Note that this method causes the Forbes website to block the IP address after a few visits!