## Web Scraping with Pagination

You were asked to web scrape the url https://venturebeat.com. Applying what we learned so far, this should be straightforward. 

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

In [None]:
# 1. GET THE HTML CONTENT OF THE MAIN PAGE
def geturlhtml(main_url, timeout=30):
    """Download a page and parse it into a BeautifulSoup document.

    Parameters
    ----------
    main_url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would block forever.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the built-in "html.parser".

    Raises
    ------
    Exception
        If the server answers with an HTTP error status (4xx/5xx).
    """
    # make HTTP request
    r = requests.get(main_url, timeout=timeout)

    # r.text is always a string (never None), so the HTTP status is the
    # only reliable signal that the request actually went through
    if not r.ok:
        raise Exception('Error getting data from {}'.format(main_url))

    # convert to beautiful soup object
    return BeautifulSoup(r.text, "html.parser")

# 2. FROM THE HTML CONTENT OF THE MAIN PAGE GET THE ARTICLE LINKS

def featuredlinks(main_htmldoc):
    """Return the href of every link inside the 'FeaturedArticles' div.

    Parameters
    ----------
    main_htmldoc : BeautifulSoup
        Parsed HTML of the page to search.

    Returns
    -------
    list
        The href values in document order. The list is empty when the
        page has no div with class 'FeaturedArticles'.
    """
    featured = main_htmldoc.find('div', class_='FeaturedArticles')

    # find() returns None when the container is missing; treat that as
    # "no featured links" instead of failing with an AttributeError
    if featured is None:
        return []

    # href=True keeps only anchors that carry an href attribute, so
    # the lookup below cannot raise a KeyError
    return [anchor['href'] for anchor in featured.find_all('a', href=True)]

def getarticlelinks(html_doc):
    """Return the href of every article-title link in a listing page.

    Parameters
    ----------
    html_doc : BeautifulSoup
        Parsed HTML of the listing page.

    Returns
    -------
    list
        The href of each anchor with class 'ArticleListing__title-link',
        in document order.
    """
    title_anchors = html_doc.find_all('a', class_='ArticleListing__title-link')
    return [anchor['href'] for anchor in title_anchors]


# 3. GET THE HTML CONTENT FOR EACH ARTICLE LINK AND GET THE ARTICLE TITLE AND TEXT
def gettextfromarticleurl(article_url, timeout=30):
    """Download one article page and extract its title and body text.

    Parameters
    ----------
    article_url : str
        Address of the article page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    tuple
        ``(article_title, article_text)``. The title is the text of the
        'h1.article-title' element. The text is the stripped content of
        the direct <p> children of the 'article-content' element, joined
        with single spaces.

    Raises
    ------
    Exception
        If the server answers with an HTTP error status, or the page
        lacks the expected title or content element.
    """
    # new requests to individual article pages
    r = requests.get(article_url, timeout=timeout)

    # r.text is never None, so check the HTTP status instead
    if not r.ok:
        raise Exception('Error getting data from {}'.format(article_url))

    # convert to beautiful soup object
    html_soup = BeautifulSoup(r.text, "html.parser")

#     # grab the category
#     cat_class = 'Label Label--single Label--brand Label__link--brand'
#     article_category = html_soup.find(class_=cat_class).text.strip()

    # locate the title and the body container
    title_tag = html_soup.find('h1', class_='article-title')
    articlecontent = html_soup.find(class_='article-content')

    # find() returns None for a missing element; report which URL had the
    # unexpected layout rather than failing with a bare AttributeError
    if title_tag is None or articlecontent is None:
        raise Exception(
            'Could not find article title/content at {}'.format(article_url)
        )

    # grab the title
    article_title = title_tag.text

    # grab the body: only direct <p> children, so nested widgets are skipped
    paragraphs = articlecontent.find_all('p', recursive=False)
    article_text = " ".join(p.text.strip() for p in paragraphs)

    return article_title, article_text

In [None]:
url = 'https://venturebeat.com'

# Step 1: download and parse the main page
html_soup = geturlhtml(url)

# Step 2: gather the featured links first, then the regular listing links
article_links = [*featuredlinks(html_soup), *getarticlelinks(html_soup)]

# Step 3: fetch every article, then split the (title, text) pairs apart
data = [gettextfromarticleurl(link) for link in article_links]
article_titles = [title for title, _ in data]
article_texts = [text for _, text in data]
for title in article_titles:
    print(title)

In [None]:
# Both counts, one per line: titles first, then texts
print(len(article_titles), len(article_texts), sep='\n')

### A. Pagination

Pagination is a web design technique that splits content across multiple pages, presenting large datasets to users in a digestible manner. There are many pagination methods:
- numbered pagination
- infinite scrolling
- next button
- load more buttons, etc. 

While pagination makes the web browsing experience better, it certainly makes the task of web scraping more difficult. 

Let's see an example now. The webpage we are looking to scrape is https://venturebeat.com. When you scroll to the bottom of the page, you will notice that at some point the url changes to https://venturebeat.com/page/2/ and this pattern continues. 

So, let's repeat what we did above for the page 2 URL and see if we get new article links.

In [None]:
# 1. GET THE LIST OF WEBPAGES TO SCRAPE
web_urls = ['https://venturebeat.com', 'https://venturebeat.com/page/2']

# 2. FOR EACH WEBPAGE GET THE ARTICLE LINKS
n_urls = len(web_urls)
all_urls = []

for page_index, page_url in enumerate(web_urls):
    html_soup = geturlhtml(page_url)
    # featured links are only requested for the first page in the list
    found = featuredlinks(html_soup) if page_index == 0 else []
    found += getarticlelinks(html_soup)
    all_urls.extend(found)
print(f'There are {len(all_urls)} article urls to retrieve.')

# 3. FOR EACH ARTICLE LINK GET THE HTML CONTENT - TITLE AND TEXT
data = [gettextfromarticleurl(link) for link in all_urls]
article_titles = [title for title, _ in data]
article_texts = [text for _, text in data]

print(f'Text retrieved for {len(article_texts)} articles')

Now we can easily expand the list of webpage URLs: simply generate one URL per page number.

In [None]:
# 1. GET THE LIST OF WEBPAGES
# Main page first, followed by one URL per page number in page_no
page_no = range(2, 15, 1)
web_urls = ['https://venturebeat.com']
web_urls.extend('https://venturebeat.com/page/' + str(n) for n in page_no)
print(web_urls)

Repeat above. 

In [None]:
# 2. FOR EACH WEBPAGE GET THE ARTICLE LINKS
n_urls = len(web_urls)
all_urls = []

for page_index, page_url in enumerate(web_urls):
    html_soup = geturlhtml(page_url)
    # featured links are only requested for the first page in the list
    found = featuredlinks(html_soup) if page_index == 0 else []
    found += getarticlelinks(html_soup)
    all_urls.extend(found)
print(f'There are {len(all_urls)} article urls to retrieve.')

# 3. FOR EACH ARTICLE LINK GET THE HTML CONTENT - TITLE AND TEXT
data = [gettextfromarticleurl(link) for link in all_urls]
article_titles = [title for title, _ in data]
article_texts = [text for _, text in data]

print(f'Text retrieved for {len(article_texts)} articles')

Convert to desired data type.

In [8]:
# 4. CONVERT DATA TO DICTIONARY TO DATAFRAME

# One column per collected list; the keyword names become the column labels
data_dictionary = dict(url=all_urls, title=article_titles, text=article_texts)
df = pd.DataFrame(data_dictionary)
print(df.shape)
df.head()

(563, 3)


Unnamed: 0,url,title,text
0,https://venturebeat.com/2021/04/27/how-merck-w...,How Merck works with Seeqc to cut through quan...,When it comes to grappling with the future of ...
1,https://venturebeat.com/2021/04/27/amazon-make...,Amazon releases DeepRacer software in open source,"In November 2018, Amazon launched AWS DeepRace..."
2,https://venturebeat.com/2021/04/27/campfire-ra...,Campfire raises $8 million to advance AR/VR fo...,Campfire has raised $8 million in funding for ...
3,https://venturebeat.com/2021/04/28/cloudcheckr...,CloudCheckr survey says cloud computing adopti...,"Cloud transformation is moving quickly, accord..."
4,https://venturebeat.com/2021/04/28/atlassians-...,Atlassian’s Jira Work Management encourages te...,"At its Team21 conference today, Atlassian unve..."


In [None]:
# # IF YOU WANT TO TRACK AS YOU COLLECT DATA FOR EACH ARTICLE

# data_dictionary = {'url':[], 'title':[], 'text':[]}
# tracker = 0
# for i in article_links:
#     title, text = gettextfromarticleurl(i)
#     data_dictionary['url'].append(i)
#     data_dictionary['title'].append(title)
#     data_dictionary['text'].append(text)
#     tracker += 1
#     print('processed ', tracker, ' files')
    
# df = pd.DataFrame.from_dict(data_dictionary)
# print(df.shape)
# df.head()

In [None]:
# Show row 0, column 0 of df ('url' is the first key of data_dictionary)
df.iloc[0,0]

In [None]:
# Show row 0, column 1 of df ('title' is the second key of data_dictionary)
df.iloc[0,1]

In [None]:
# Show row 0, column 2 of df ('text' is the third key of data_dictionary)
df.iloc[0,2]

In [None]:
# Print a summary of df (columns, non-null counts, dtypes)
df.info()

In [None]:
# Save df to 'venturebeat.csv'; index=False leaves the row index out of the file
df.to_csv('venturebeat.csv', index=False)