BeautifulSoup 예제 1
urljoin 사용법
from urllib.parse import urljoin
baseUrl = "http://test.com/html/a.html"
print( urljoin(baseUrl, "b.html") )
print( urljoin(baseUrl, "sub/c.html") )
print( urljoin(baseUrl, "../index.html") )
print( urljoin(baseUrl, "../img/hoge.png") )
print( urljoin(baseUrl, "../css/hoge.css") )
http://test.com/html/b.html
http://test.com/html/sub/c.html
http://test.com/index.html
http://test.com/img/hoge.png
http://test.com/css/hoge.css
태그선택자
from bs4 import BeautifulSoup
html = """
<html><body>
<ul>
<li><a href="http://www.naver.com">naver</a></li>
<li><a href="http://www.daum.net">daum</a></li>
<li><a href="https://www.google.com">google</a></li>
<li><a href="https://www.tistory.com">tistory</a></li>
</ul>
</body></html>
"""
soup = BeautifulSoup(html, 'html.parser')
links = soup.find_all("a")
print(links)
[<a href="http://www.naver.com">naver</a>, <a href="http://www.daum.net">daum</a>, <a href="https://www.google.com">google</a>, <a href="https://www.tistory.com">tistory</a>]
for a in links:
print('a' ,type(a) ,a )
href = a['href'] # 또는 A.attrs['href']
print('href',href)
print('txt',a.text,) #또는 a.string
a <class 'bs4.element.Tag'> <a href="http://www.naver.com">naver</a>
href http://www.naver.com
txt naver
a <class 'bs4.element.Tag'> <a href="http://www.daum.net">daum</a>
href http://www.daum.net
txt daum
a <class 'bs4.element.Tag'> <a href="https://www.google.com">google</a>
href https://www.google.com
txt google
a <class 'bs4.element.Tag'> <a href="https://www.tistory.com">tistory</a>
href https://www.tistory.com
txt tistory
a = soup.find_all("a", string='daum') # a 태그 중 문자가 daum 인 것
a
[<a href="http://www.daum.net">daum</a>]
for i in a:
print(i.text)
daum
b= soup.find_all("a", limit=3) #limit=0 무제한
b
[<a href="http://www.naver.com">naver</a>,
<a href="http://www.daum.net">daum</a>,
<a href="https://www.google.com">google</a>]
c = soup.find_all(string=['naver']) #보통은 정규표현식으로 사용
c
['naver']
css selector
css 설명서
css 연습소
html = """
<html><body>
<div id="main">
<h1>강의목록</h1>
<ul class="lecs">
<li>Java 초고수 되기</li>
<li>파이썬 기초 프로그래밍</li>
<li>파이썬 머신러닝 프로그래밍</li>
<li>안드로이드 블루투스 프로그래밍</li>
</ul>
</div>
</body></html>
"""
soup = BeautifulSoup(html, 'html.parser')
h1 = soup.select("div#main > h1")
print(h1)
[<h1>강의목록</h1>]
print(h1[0].text)
강의목록
h11= soup.select_one("div#main > h1")
print(h11.text)
강의목록
list_li = soup.select("#main > ul.lecs > li")
for li in list_li:
print(li.text)
Java 초고수 되기
파이썬 기초 프로그래밍
파이썬 머신러닝 프로그래밍
안드로이드 블루투스 프로그래밍
Leave a Comment