the problem I am facing is this news scraper is not working and I can't figure out why. If it is possible can someone help me and amend the code. Thank you very much. I have put the code and the error that occurs when I run it. I have tried many possible solutions I just can't figure it out it shows where the errors are however I can't figure out what to change them to.
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import pandas as pd
import numpy as np
from datetime import datetime
page = requests.get('https://qz.com/africa/latest')
soup = BeautifulSoup(page.content, 'html.parser')
weblinks = soup.find_all('article')
pagelinks = []
for link in weblinks[5:]:
url = link.contents[0].find_all('a')[0]
pagelinks.append('http://qz.com'+url.get('href'))
authorname = []
title = []
thearticle = []
for link in pagelinks:
# store the text for each article
paragraphtext = []
# get url
url = link
# get page text
page = requests.get(url)
# parse with BFS
soup = BeautifulSoup(page.text, 'html.parser')
# get author name, if there's a named author
try:
abody = soup.find(class_='d3284 africa').find('a')
aname = abody.get_text()
except:
aname = 'Anonymous'
# get article title
atitle = soup.find(class_="_21349 africa none _4ca8e")
thetitle = atitle.get_text()
# get main article page
articlebody = soup.find(class_='_61c55')
# get text
articletext = soup.find_all('p')[8:]
# print text
for paragraph in articletext[:-1]:
# get the text only
text = paragraph.get_text()
paragraphtext.append(text)
# combine all paragraphs into an article
thearticle.append(paragraphtext)
authorname.append(aname)
title.append(thetitle)
# join paragraphs to re-create the article
myarticle = [' '.join(article) for article in thearticle]
# save article data to file
data = {'Title':title,
'Author':authorname,
'PageLink':pagelinks,
'Article':myarticle,
'Date':datetime.now()}
oldnews = pd.read_excel('quartz\\news.xlsx')
news = pd.DataFrame(data=data)
cols = ['Title', 'Author', 'PageLink', 'Article', 'Date']
news = news[cols]
afronews = oldnews.append(news)
afronews.drop_duplicates(subset='Title', keep='last', inplace=True)
afronews.reset_index(inplace=True)
afronews.drop(labels='index', axis=1, inplace=True)
filename = 'quartz\\news.xlsx'
wks_name = 'Data'
writer = pd.ExcelWriter(filename)
afronews.to_excel(writer, wks_name, index=False)
writer.save()
The error that comes out is:
FileNotFoundError Traceback (most recent call last)
<ipython-input-3-dddb080986df> in <module>()
65 'Date':datetime.now()}
66
---> 67 oldnews = pd.read_excel('quartz\\news.xlsx')
68 news = pd.DataFrame(data=data)
69 cols = ['Title', 'Author', 'PageLink', 'Article', 'Date']
~/anaconda3/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
176 else:
177 kwargs[new_arg_name] = new_arg_value
--> 178 return func(*args, **kwargs)
179 return wrapper
180 return _deprecate_kwarg
~/anaconda3/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
176 else:
177 kwargs[new_arg_name] = new_arg_value
--> 178 return func(*args, **kwargs)
179 return wrapper
180 return _deprecate_kwarg
~/anaconda3/lib/python3.7/site-packages/pandas/io/excel.py in read_excel(io, sheet_name, header, names, index_col, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, parse_dates, date_parser, thousands, comment, skipfooter, convert_float, **kwds)
305
306 if not isinstance(io, ExcelFile):
--> 307 io = ExcelFile(io, engine=engine)
308
309 return io.parse(
~/anaconda3/lib/python3.7/site-packages/pandas/io/excel.py in __init__(self, io, **kwds)
392 self.book = xlrd.open_workbook(file_contents=data)
393 elif isinstance(self._io, compat.string_types):
--> 394 self.book = xlrd.open_workbook(self._io)
395 else:
396 raise ValueError('Must explicitly set engine if not passing in'
~/anaconda3/lib/python3.7/site-packages/xlrd/__init__.py in open_workbook(filename, logfile, verbosity, use_mmap, file_contents, encoding_override, formatting_info, on_demand, ragged_rows)
114 peek = file_contents[:peeksz]
115 else:
--> 116 with open(filename, "rb") as f:
117 peek = f.read(peeksz)
118 if peek == b"PK\x03\x04": # a ZIP file
FileNotFoundError: [Errno 2] No such file or directory: 'quartz\\news.xlsx'
Comments
Post a Comment