Python-web
Relevant libraries
- urllib2
- scrapy
- beautifulsoup
- requests
Quick start
Use requests and BeautifulSoup:
- Import the libraries
import requests
import bs4
# Target URL
url = 'http://www.idehe.com'  # wordpress blog
# Fetch the HTML; the timeout keeps the script from hanging on an unresponsive server
response = requests.get(url, timeout=10)
# Parse the response body with the standard-library HTML parser
soup = bs4.BeautifulSoup(response.text, 'html.parser')
# Demo output (single-argument print(...) works on both Python 2 and Python 3)
# prettify must be called; without () the bound method object is printed instead of the HTML
print(soup.prettify())
print(soup.title)
print(soup.head)
- Find elements by tag name (any <xxx> tag) or by id
# find_all(name) returns a list-like collection of every tag with that name
# (single-argument print(...) works on both Python 2 and Python 3)
print(soup.find_all('h1'))
print(soup.find_all('p'))
print(soup.find_all('button'))
print(soup.find_all('title'))
# find(id=...) returns the first element with that id, or None when nothing matches
print(soup.find(id="post-870"))
# Third link on the page (index 2); raises IndexError if the page has fewer than three links
print(soup.find_all('a')[2])
- Find all the articles (the example prints the third one)
# Collect every <article> tag and print the third one (index 2);
# raises IndexError if the page has fewer than three articles
print(soup.find_all('article')[2])