{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introducing Python Workshop #\n", "### Session I - Web scraping is easy with Python ###" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import requests\n", "from bs4 import BeautifulSoup\n", "import pandas as pd\n", "\n", "# request data from website and store in a variable\n", "# (the timeout keeps a stalled connection from hanging the kernel)\n", "website_url = requests.get(\"https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue\", timeout=30)\n", "website_url.raise_for_status()  # stop on an HTTP error instead of parsing an error page\n", "website_content = website_url.text\n", "\n", "# convert data to soup object for easy html parsing\n", "soup = BeautifulSoup(website_content, \"lxml\")\n", "my_table = soup.find( 'table', {'class':'wikitable sortable'} )\n", "if my_table is None:\n", "    # soup.find returns None when nothing matches; fail with a readable message\n", "    raise ValueError('no table with class wikitable sortable found on the page')\n", "\n", "# labels for the first six cells of every row, in the order the cells are read.\n", "# Revenue_Growth must come before Employees: the output saved in this notebook was\n", "# produced while these two labels were swapped (head-counts under Revenue_Growth,\n", "# percentages under Employees) - re-run the notebook to refresh it.\n", "mytable_header_list = ['Rank', 'Name', 'Industry', 'Revenue', 'Revenue_Growth', 'Employees']\n", "\n", "# parse the table and convert to Python dictionary (one list per column);\n", "# the keys are derived from the header list so the two cannot drift apart again\n", "mytable_dict = { header: [] for header in mytable_header_list + ['Country'] }\n", "\n", "# my_table('tr') is shorthand for find_all('tr'); [1:] skips the header row\n", "for eachrow in my_table('tr')[1:]:\n", "    cells = eachrow(['th', 'td'])\n", "\n", "    for idx, header in enumerate(mytable_header_list):\n", "        mytable_dict[header].append( cells[idx].text.strip() )\n", "\n", "    # the country is read from the title of the flag-icon link in the seventh cell\n", "    mytable_dict['Country'].append( cells[6].select('.flagicon > a')[0].get('title') )\n", "\n", "df = pd.DataFrame(mytable_dict)\n", "df.to_csv('companies_by_revenue.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
RankNameIndustryRevenueRevenue_GrowthEmployeesCountry
01WalmartRetail$500,3432,300,0003.0%United States
12State GridElectricity$348,903913,54610.7%China
23SinopecOil and gas$326,953667,79322.2%China
34China National PetroleumOil and gas$326,0081,636,53224.2%China
45Royal Dutch ShellOil and gas$311,87084,00029.9%Netherlands
56ToyotaAutomotive$265,172369,1244.1%Japan
67VolkswagenAutomotive$260,028642,2928.2%Germany
78BPOil and gas$244,58274,00031.1%United Kingdom
89ExxonMobilOil and gas$244,36371,20017.4%United States
910Berkshire HathawayConglomerate$242,137377,0008.3%United States
1011AppleElectronics$229,234123,0006.3%United States
1112SamsungElectronics$211,940320,67121.8%South Korea
1213McKessonHealthcare$208,35768,0004.9%United States
1314GlencoreMiningCommodities$205,47682,68118.2%Switzerland
1415UnitedHealthHealthcare$201,159260,0009.3%United States
1516DaimlerAutomotive$185,235289,3219.3%Germany
1617CVS HealthHealthcare$184,765203,0004.1%United States
1718AmazonRetail$177,866566,00030.8%United States
1819ExorFinancials$161,677307,6374.4%Italy
1920AT&TTelecommunications$160,546254,0002.0%United States
2021General MotorsAutomotive$157,311180,0005.5%United States
2122FordAutomotive$156,776202,0003.3%United States
2223China State ConstructionConstruction$156,071270,4678.0%China
2324FoxconnElectronics$154,699803,12614.5%Taiwan
2425AmerisourceBergenPharmaceuticals$153,14419,5004.3%United States
2526Industrial & Commercial Bank of ChinaFinancials$153,021453,0483.6%China
2627AXAFinancials$149,46195,7284.0%France
2728TotalOil and gas$149,09998,27716.6%France
2829Ping An InsuranceFinancials$144,197342,55023.7%China
2930HondaAutomotive$138,646215,6387.3%Japan
3031China Construction BankFinancials$138,594370,4152.6%China
3132TrafiguraCommodities$136,4213,93539.1%Singapore
3233ChevronOil and gas$134,53351,90025.1%United States
3334Cardinal HealthPharmaceuticals$129,97640,4006.9%United States
3435CostcoRetail$129,025182,0008.7%United States
3536SAIC MotorAutomotive$128,819148,76713.1%China
3637VerizonTelecommunications$126,034155,4000.0%United States
3738AllianzFinancials$123,532140,5531.1%Germany
3839KrogerRetail$122,662449,0006.4%United States
3940Agricultural Bank of ChinaFinancials$122,366491,5784.3%China
4041General ElectricConglomerate$122,274313,0003.5%United States
4142China Life InsuranceFinancials$120,224170,51714.7%China
4243Walgreens Boots AllianceRetailPharmaceuticals$118,214290,0000.7%United States
4344BNP ParibasFinancials$117,375189,5097.7%France
4445Japan Post HoldingsConglomerate$116,616245,8635.2%Japan
4546Bank of ChinaFinancials$115,423311,1331.5%China
4647JPMorgan ChaseFinancials$113,899252,5398.0%United States
4748Fannie MaeFinancials$112,3947,2004.9%United States
4849GazpromOil and gas$111,983469,60022.5%Russia
4950PrudentialFinancials$111,45824,71114.9%United Kingdom
\n", "
" ], "text/plain": [ " Rank Name Industry \\\n", "0 1 Walmart Retail \n", "1 2 State Grid Electricity \n", "2 3 Sinopec Oil and gas \n", "3 4 China National Petroleum Oil and gas \n", "4 5 Royal Dutch Shell Oil and gas \n", "5 6 Toyota Automotive \n", "6 7 Volkswagen Automotive \n", "7 8 BP Oil and gas \n", "8 9 ExxonMobil Oil and gas \n", "9 10 Berkshire Hathaway Conglomerate \n", "10 11 Apple Electronics \n", "11 12 Samsung Electronics \n", "12 13 McKesson Healthcare \n", "13 14 Glencore MiningCommodities \n", "14 15 UnitedHealth Healthcare \n", "15 16 Daimler Automotive \n", "16 17 CVS Health Healthcare \n", "17 18 Amazon Retail \n", "18 19 Exor Financials \n", "19 20 AT&T Telecommunications \n", "20 21 General Motors Automotive \n", "21 22 Ford Automotive \n", "22 23 China State Construction Construction \n", "23 24 Foxconn Electronics \n", "24 25 AmerisourceBergen Pharmaceuticals \n", "25 26 Industrial & Commercial Bank of China Financials \n", "26 27 AXA Financials \n", "27 28 Total Oil and gas \n", "28 29 Ping An Insurance Financials \n", "29 30 Honda Automotive \n", "30 31 China Construction Bank Financials \n", "31 32 Trafigura Commodities \n", "32 33 Chevron Oil and gas \n", "33 34 Cardinal Health Pharmaceuticals \n", "34 35 Costco Retail \n", "35 36 SAIC Motor Automotive \n", "36 37 Verizon Telecommunications \n", "37 38 Allianz Financials \n", "38 39 Kroger Retail \n", "39 40 Agricultural Bank of China Financials \n", "40 41 General Electric Conglomerate \n", "41 42 China Life Insurance Financials \n", "42 43 Walgreens Boots Alliance RetailPharmaceuticals \n", "43 44 BNP Paribas Financials \n", "44 45 Japan Post Holdings Conglomerate \n", "45 46 Bank of China Financials \n", "46 47 JPMorgan Chase Financials \n", "47 48 Fannie Mae Financials \n", "48 49 Gazprom Oil and gas \n", "49 50 Prudential Financials \n", "\n", " Revenue Revenue_Growth Employees Country \n", "0 $500,343 2,300,000 3.0% United States \n", "1 $348,903 913,546 10.7% China \n", "2 
$326,953 667,793 22.2% China \n", "3 $326,008 1,636,532 24.2% China \n", "4 $311,870 84,000 29.9% Netherlands \n", "5 $265,172 369,124 4.1% Japan \n", "6 $260,028 642,292 8.2% Germany \n", "7 $244,582 74,000 31.1% United Kingdom \n", "8 $244,363 71,200 17.4% United States \n", "9 $242,137 377,000 8.3% United States \n", "10 $229,234 123,000 6.3% United States \n", "11 $211,940 320,671 21.8% South Korea \n", "12 $208,357 68,000 4.9% United States \n", "13 $205,476 82,681 18.2% Switzerland \n", "14 $201,159 260,000 9.3% United States \n", "15 $185,235 289,321 9.3% Germany \n", "16 $184,765 203,000 4.1% United States \n", "17 $177,866 566,000 30.8% United States \n", "18 $161,677 307,637 4.4% Italy \n", "19 $160,546 254,000 2.0% United States \n", "20 $157,311 180,000 5.5% United States \n", "21 $156,776 202,000 3.3% United States \n", "22 $156,071 270,467 8.0% China \n", "23 $154,699 803,126 14.5% Taiwan \n", "24 $153,144 19,500 4.3% United States \n", "25 $153,021 453,048 3.6% China \n", "26 $149,461 95,728 4.0% France \n", "27 $149,099 98,277 16.6% France \n", "28 $144,197 342,550 23.7% China \n", "29 $138,646 215,638 7.3% Japan \n", "30 $138,594 370,415 2.6% China \n", "31 $136,421 3,935 39.1% Singapore \n", "32 $134,533 51,900 25.1% United States \n", "33 $129,976 40,400 6.9% United States \n", "34 $129,025 182,000 8.7% United States \n", "35 $128,819 148,767 13.1% China \n", "36 $126,034 155,400 0.0% United States \n", "37 $123,532 140,553 1.1% Germany \n", "38 $122,662 449,000 6.4% United States \n", "39 $122,366 491,578 4.3% China \n", "40 $122,274 313,000 3.5% United States \n", "41 $120,224 170,517 14.7% China \n", "42 $118,214 290,000 0.7% United States \n", "43 $117,375 189,509 7.7% France \n", "44 $116,616 245,863 5.2% Japan \n", "45 $115,423 311,133 1.5% China \n", "46 $113,899 252,539 8.0% United States \n", "47 $112,394 7,200 4.9% United States \n", "48 $111,983 469,600 22.5% Russia \n", "49 $111,458 24,711 14.9% United Kingdom " ] }, 
"execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }