{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introducing Python Workshop #\n",
    "### Session I - Web scraping is easy with Python ###"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import pandas as pd\n",
    "\n",
    "# Request the page. website_url holds the Response object returned by requests.get().\n",
    "# The timeout keeps the cell from hanging on a stalled connection, and\n",
    "# raise_for_status() turns an HTTP error status into an exception instead of\n",
    "# letting an error page be parsed as if it were the article.\n",
    "website_url = requests.get(\n",
    "    \"https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue\",\n",
    "    timeout=30,\n",
    ")\n",
    "website_url.raise_for_status()\n",
    "website_content = website_url.text\n",
    "\n",
    "# convert data to soup object for easy html parsing\n",
    "soup = BeautifulSoup(website_content, \"lxml\")\n",
    "my_table = soup.find( 'table', {'class':'wikitable sortable'} )\n",
    "if my_table is None:\n",
    "    raise ValueError(\"Could not find the 'wikitable sortable' table on the page\")\n",
    "\n",
    "# Labels for the first six cells of each row, in the order the cells are read.\n",
    "# Cell 4 is the revenue growth and cell 5 is the employee count; keep this order,\n",
    "# otherwise the two columns end up stored under each other's name.\n",
    "mytable_header_list = ['Rank', 'Name', 'Industry', 'Revenue', 'Revenue_Growth', 'Employees']\n",
    "\n",
    "# parse the table and convert to Python dictionary (one list per column);\n",
    "# the keys are derived from the header list so the two cannot drift apart\n",
    "mytable_dict = { header: [] for header in mytable_header_list + ['Country'] }\n",
    "\n",
    "# skip the first <tr>, which is the header row\n",
    "for eachrow in my_table('tr')[1:]:\n",
    "    cells = eachrow(['th', 'td'])\n",
    "\n",
    "    for idx, header in enumerate(mytable_header_list):\n",
    "        mytable_dict[header].append( cells[idx].text.strip() )\n",
    "\n",
    "    # The country name is taken from the title of the link inside the flag icon;\n",
    "    # fall back to the cell text when a row has no flag link.\n",
    "    flag_link = cells[6].select_one('.flagicon > a')\n",
    "    if flag_link is not None:\n",
    "        mytable_dict['Country'].append( flag_link.get('title') )\n",
    "    else:\n",
    "        mytable_dict['Country'].append( cells[6].text.strip() )\n",
    "\n",
    "df = pd.DataFrame(mytable_dict)\n",
    "df.to_csv('companies_by_revenue.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [ "
\n", " | Rank | \n", "Name | \n", "Industry | \n", "Revenue | \n", "Revenue_Growth | \n", "Employees | \n", "Country | \n", "
---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "Walmart | \n", "Retail | \n", "$500,343 | \n", "2,300,000 | \n", "3.0% | \n", "United States | \n", "
1 | \n", "2 | \n", "State Grid | \n", "Electricity | \n", "$348,903 | \n", "913,546 | \n", "10.7% | \n", "China | \n", "
2 | \n", "3 | \n", "Sinopec | \n", "Oil and gas | \n", "$326,953 | \n", "667,793 | \n", "22.2% | \n", "China | \n", "
3 | \n", "4 | \n", "China National Petroleum | \n", "Oil and gas | \n", "$326,008 | \n", "1,636,532 | \n", "24.2% | \n", "China | \n", "
4 | \n", "5 | \n", "Royal Dutch Shell | \n", "Oil and gas | \n", "$311,870 | \n", "84,000 | \n", "29.9% | \n", "Netherlands | \n", "
5 | \n", "6 | \n", "Toyota | \n", "Automotive | \n", "$265,172 | \n", "369,124 | \n", "4.1% | \n", "Japan | \n", "
6 | \n", "7 | \n", "Volkswagen | \n", "Automotive | \n", "$260,028 | \n", "642,292 | \n", "8.2% | \n", "Germany | \n", "
7 | \n", "8 | \n", "BP | \n", "Oil and gas | \n", "$244,582 | \n", "74,000 | \n", "31.1% | \n", "United Kingdom | \n", "
8 | \n", "9 | \n", "ExxonMobil | \n", "Oil and gas | \n", "$244,363 | \n", "71,200 | \n", "17.4% | \n", "United States | \n", "
9 | \n", "10 | \n", "Berkshire Hathaway | \n", "Conglomerate | \n", "$242,137 | \n", "377,000 | \n", "8.3% | \n", "United States | \n", "
10 | \n", "11 | \n", "Apple | \n", "Electronics | \n", "$229,234 | \n", "123,000 | \n", "6.3% | \n", "United States | \n", "
11 | \n", "12 | \n", "Samsung | \n", "Electronics | \n", "$211,940 | \n", "320,671 | \n", "21.8% | \n", "South Korea | \n", "
12 | \n", "13 | \n", "McKesson | \n", "Healthcare | \n", "$208,357 | \n", "68,000 | \n", "4.9% | \n", "United States | \n", "
13 | \n", "14 | \n", "Glencore | \n", "MiningCommodities | \n", "$205,476 | \n", "82,681 | \n", "18.2% | \n", "Switzerland | \n", "
14 | \n", "15 | \n", "UnitedHealth | \n", "Healthcare | \n", "$201,159 | \n", "260,000 | \n", "9.3% | \n", "United States | \n", "
15 | \n", "16 | \n", "Daimler | \n", "Automotive | \n", "$185,235 | \n", "289,321 | \n", "9.3% | \n", "Germany | \n", "
16 | \n", "17 | \n", "CVS Health | \n", "Healthcare | \n", "$184,765 | \n", "203,000 | \n", "4.1% | \n", "United States | \n", "
17 | \n", "18 | \n", "Amazon | \n", "Retail | \n", "$177,866 | \n", "566,000 | \n", "30.8% | \n", "United States | \n", "
18 | \n", "19 | \n", "Exor | \n", "Financials | \n", "$161,677 | \n", "307,637 | \n", "4.4% | \n", "Italy | \n", "
19 | \n", "20 | \n", "AT&T | \n", "Telecommunications | \n", "$160,546 | \n", "254,000 | \n", "2.0% | \n", "United States | \n", "
20 | \n", "21 | \n", "General Motors | \n", "Automotive | \n", "$157,311 | \n", "180,000 | \n", "5.5% | \n", "United States | \n", "
21 | \n", "22 | \n", "Ford | \n", "Automotive | \n", "$156,776 | \n", "202,000 | \n", "3.3% | \n", "United States | \n", "
22 | \n", "23 | \n", "China State Construction | \n", "Construction | \n", "$156,071 | \n", "270,467 | \n", "8.0% | \n", "China | \n", "
23 | \n", "24 | \n", "Foxconn | \n", "Electronics | \n", "$154,699 | \n", "803,126 | \n", "14.5% | \n", "Taiwan | \n", "
24 | \n", "25 | \n", "AmerisourceBergen | \n", "Pharmaceuticals | \n", "$153,144 | \n", "19,500 | \n", "4.3% | \n", "United States | \n", "
25 | \n", "26 | \n", "Industrial & Commercial Bank of China | \n", "Financials | \n", "$153,021 | \n", "453,048 | \n", "3.6% | \n", "China | \n", "
26 | \n", "27 | \n", "AXA | \n", "Financials | \n", "$149,461 | \n", "95,728 | \n", "4.0% | \n", "France | \n", "
27 | \n", "28 | \n", "Total | \n", "Oil and gas | \n", "$149,099 | \n", "98,277 | \n", "16.6% | \n", "France | \n", "
28 | \n", "29 | \n", "Ping An Insurance | \n", "Financials | \n", "$144,197 | \n", "342,550 | \n", "23.7% | \n", "China | \n", "
29 | \n", "30 | \n", "Honda | \n", "Automotive | \n", "$138,646 | \n", "215,638 | \n", "7.3% | \n", "Japan | \n", "
30 | \n", "31 | \n", "China Construction Bank | \n", "Financials | \n", "$138,594 | \n", "370,415 | \n", "2.6% | \n", "China | \n", "
31 | \n", "32 | \n", "Trafigura | \n", "Commodities | \n", "$136,421 | \n", "3,935 | \n", "39.1% | \n", "Singapore | \n", "
32 | \n", "33 | \n", "Chevron | \n", "Oil and gas | \n", "$134,533 | \n", "51,900 | \n", "25.1% | \n", "United States | \n", "
33 | \n", "34 | \n", "Cardinal Health | \n", "Pharmaceuticals | \n", "$129,976 | \n", "40,400 | \n", "6.9% | \n", "United States | \n", "
34 | \n", "35 | \n", "Costco | \n", "Retail | \n", "$129,025 | \n", "182,000 | \n", "8.7% | \n", "United States | \n", "
35 | \n", "36 | \n", "SAIC Motor | \n", "Automotive | \n", "$128,819 | \n", "148,767 | \n", "13.1% | \n", "China | \n", "
36 | \n", "37 | \n", "Verizon | \n", "Telecommunications | \n", "$126,034 | \n", "155,400 | \n", "0.0% | \n", "United States | \n", "
37 | \n", "38 | \n", "Allianz | \n", "Financials | \n", "$123,532 | \n", "140,553 | \n", "1.1% | \n", "Germany | \n", "
38 | \n", "39 | \n", "Kroger | \n", "Retail | \n", "$122,662 | \n", "449,000 | \n", "6.4% | \n", "United States | \n", "
39 | \n", "40 | \n", "Agricultural Bank of China | \n", "Financials | \n", "$122,366 | \n", "491,578 | \n", "4.3% | \n", "China | \n", "
40 | \n", "41 | \n", "General Electric | \n", "Conglomerate | \n", "$122,274 | \n", "313,000 | \n", "3.5% | \n", "United States | \n", "
41 | \n", "42 | \n", "China Life Insurance | \n", "Financials | \n", "$120,224 | \n", "170,517 | \n", "14.7% | \n", "China | \n", "
42 | \n", "43 | \n", "Walgreens Boots Alliance | \n", "RetailPharmaceuticals | \n", "$118,214 | \n", "290,000 | \n", "0.7% | \n", "United States | \n", "
43 | \n", "44 | \n", "BNP Paribas | \n", "Financials | \n", "$117,375 | \n", "189,509 | \n", "7.7% | \n", "France | \n", "
44 | \n", "45 | \n", "Japan Post Holdings | \n", "Conglomerate | \n", "$116,616 | \n", "245,863 | \n", "5.2% | \n", "Japan | \n", "
45 | \n", "46 | \n", "Bank of China | \n", "Financials | \n", "$115,423 | \n", "311,133 | \n", "1.5% | \n", "China | \n", "
46 | \n", "47 | \n", "JPMorgan Chase | \n", "Financials | \n", "$113,899 | \n", "252,539 | \n", "8.0% | \n", "United States | \n", "
47 | \n", "48 | \n", "Fannie Mae | \n", "Financials | \n", "$112,394 | \n", "7,200 | \n", "4.9% | \n", "United States | \n", "
48 | \n", "49 | \n", "Gazprom | \n", "Oil and gas | \n", "$111,983 | \n", "469,600 | \n", "22.5% | \n", "Russia | \n", "
49 | \n", "50 | \n", "Prudential | \n", "Financials | \n", "$111,458 | \n", "24,711 | \n", "14.9% | \n", "United Kingdom | \n", "