Matthieu Choplin
<html><head><meta http-equiv="Content-Type"
content="text/html; charset=windows-1252">
<title>Profile: Aphrodite</title>
<link rel="stylesheet" type="text/css"></head>
<body bgcolor="yellow">
<center>
<br><br>
<img src="./Profile_ Aphrodite_files/aphrodite.gif">
<h2>Name: Aphrodite</h2>
<br><br>
Favorite animal: Dove
<br><br>
Favorite color: Red
<br><br>
Hometown: Mount Olympus
</center>
</body></html>
from urllib.request import urlopen
my_address = "http://mattchoplin.com/python_city/practice/Profile_Aphrodite.htm"
html_page = urlopen(my_address)
html_text = html_page.read().decode('utf-8')
print(html_text)
What is the type of object that is returned?
from urllib.request import urlopen
my_address = "http://mattchoplin.com/python_city/" \
"practice/Profile_Aphrodite.htm"
html_page = urlopen(my_address)
html_text = html_page.read().decode('utf-8')
start_tag = "<title>"
end_tag = "</title>"
start_index = html_text.find(start_tag) + len(start_tag)
end_index = html_text.find(end_tag)
print(html_text[start_index:end_index])
from urllib.request import urlopen
my_address = "http://mattchoplin.com/python_city/" \
"practice/Profile_Poseidon.htm"
html_page = urlopen(my_address)
html_text = html_page.read().decode('utf-8')
start_tag = ""
end_tag = " "
start_index = html_text.find(start_tag) + len(start_tag)
end_index = html_text.find(end_tag)
print(html_text[start_index:end_index])
<head><meta http-equiv="Content-Type" content="text/html; charset=windows-1252">
<title >Profile: Poseidon
# re.findall(<regular_expression>, <string_to_test>)
re.findall('ab*c', 'ABC') # nothing found
re.findall('ab*c', 'ABC', re.IGNORECASE) # ABC found
import re
match_results = re.search('ab*c', 'ABC', re.IGNORECASE)
print(match_results.group()) # returns ABC
my_string = "This is very boring"
print(my_string.replace('boring', 'funny'))
import re
print(re.sub('boring', 'WHAT?', my_string))
my_string = 'Everything is <replaced> if it is in <tags>'
my_string = re.sub('<.*>', 'BAR', my_string)
print(my_string) # 'Everything is BAR'
my_string = 'Everything is <replaced> if it is in <tags>'
my_string = re.sub('<.*?>', 'BAR', my_string)
print(my_string) # 'Everything is BAR if it is in BAR'
<TITLE >Profile: Dionysus</title / >
import re
from urllib.request import urlopen
my_address = "http://mattchoplin.com/python_city/practice/Profile_Dionysus.htm"
html_page = urlopen(my_address)
html_text = html_page.read().decode('utf-8')
match_results = re.search("<title .*?>.*</title .*?>", html_text, re.IGNORECASE)
title = match_results.group()
title = re.sub("<.*?>", "", title)
print(title)
from bs4 import BeautifulSoup
from urllib.request import urlopen
my_address = "http://mattchoplin.com/python_city/" \
"practice/Profile_Dionysus.htm"
html_page = urlopen(my_address)
html_text = html_page.read().decode('utf-8')
my_soup = BeautifulSoup(html_text, "html.parser")
print(my_soup.get_text())
print(my_soup.get_text().replace("\n\n\n",""))
print(my_soup.find_all("img"))
[<img src="dionysus.jpg"/>, <img src="grapes.png"><br><br>
Hometown: Mount Olympus
<br><br>
Favorite animal: Leopard <br>
<br>
Favorite Color: Wine
</br></br></br></br></br></br></img>]
for tag in my_soup.find_all("img"):
print(tag.name)
print(tag['src'])
print(my_soup.title)
print(my_soup.title.string)
Selector passed to the select method | Will match... |
---|---|
soup.select('div') | All elements named <div> |
soup.select('#author') | The element with an id attribute of author |
soup.select('.notice') | All elements that use a CSS |
soup.select('div span') | All elements named <span> that are within an element named <div> |
soup.select('div > span') | All elements named <span> that are directly within an element named <div>, with no other elements in between |
soup.select('input[name]') | All elements named <input> that have a name attribute with any value |
soup.select('input[type="button"]') | All elements named <input> that have an attribute name type with value button |
import mechanicalsoup
my_browser = mechanicalsoup.Browser(
soup_config={'features':'html.parser'})
page = my_browser.get("http://mattchoplin.com/python_city/" \
"practice/Profile_Aphrodite.htm")
print(page.soup)
import mechanicalsoup
my_browser = mechanicalsoup.Browser(
soup_config={'features':'html.parser'})
login_page = my_browser.get(
"https://whispering-reef-69172.herokuapp.com/login")
login_html = login_page.soup
form = login_html.select("form")[0]
form.select("input")[0]["value"] = "admin"
form.select("input")[1]["value"] = "default"
profiles_page = my_browser.submit(form, login_page.url)
print(profiles_page.url)
print(profiles_page.soup)
<div class="price">40.08</div>
import mechanicalsoup
my_browser = mechanicalsoup.Browser()
page = my_browser.get("https://www.bloomberg.com/quote/YHOO:US")
html_text = page.soup
# return a list of all the tags where
# the css class is 'price'
my_tags = html_text.select(".price")
# take the BeautifulSoup string out of the
# first (and only) <span> tag
my_price = my_tags[0].text
print("The current price of "
"YHOO is: {}".format(my_price))
from time import sleep
print "I'm about to wait for five seconds..."
sleep(5)
print "Done waiting!"
from time import sleep
import mechanicalsoup
my_browser = mechanicalsoup.Browser()
# obtain 1 stock quote per minute for the next 3 minutes
for i in range(0, 3):
page = my_browser.get("https://www.bloomberg.com/quote/YHOO:US")
html_text = page.soup
# return a list of all the tags where the class is 'price'
my_tags = html_text.select(".price")
# take the BeautifulSoup string out of the first tag
my_price = my_tags[0].text
print("The current price of YHOO is: {}".format(my_price))
if i<2: # wait a minute if this isn't the last request
sleep(60)
import requests
res = requests.get(url)
res.raise_for_status()