웹스크래핑 추가 공부

HTML에 대한 기초지식 이해

<html>
    <head>
        <meta charset="utf-8">
        <title>나도코딩 홈페이지</title>
    </head>
    <body>
        <input type="text" value="아이디를 입력하세요">
        <input type="password">
        <input type="button" value="로그인">
        <a href="http://google.com">구글로 이동하기</a> 
    </body>
</html>

xpath

/학교/학년/반/학생[2]
//*[@학번="1-1-5"] 문서 전체에서 학번 속성 값이 "1-1-5"인 element를 찾는다. 해당 학번 값이 하나밖에 없을 때 쓸 수 있다.

requests

import requests
# Fetch the blog front page (an earlier attempt targeted "http://naver.com").
res = requests.get(url="http://nadocoding.tistory.com")

# Manual status check that was tried first and is kept here for reference;
# raise_for_status() below replaces it:
#     print("응답코드: ", res.status_code)   # 200 means OK
#     if res.status_code == requests.codes.ok: report success
#     else: report the status code as an error
res.raise_for_status()
# (A "scraping starts here" progress print used to follow this check.)

# The same request/check pair once more, without the commentary.
res = requests.get(url="http://nadocoding.tistory.com")

res.raise_for_status()

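requests.get()과 res.raise_for_status()는 짝꿍처럼 두 개를 항상 같이 써 주면 된다. raise_for_status()는 응답 코드가 오류(4xx, 5xx)일 때 예외를 발생시켜, 스크래핑 요청에 오류가 났는지 확인해 준다.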

requests로 원하는 페이지에 접속하고, 원하는 정보를 제대로 받았는지 확인할 수 있다. 받은 내용을 파일로 저장하는 것까지 가능하다.

정규식

차 뒤 번호판 형식이 맞는지?

주민등록번호 형식이 맞는지? 이렇게 문자열이 정해진 형식에 맞는지 확인하는 것이 정규식

차 뒤 번호판이 4자리인데 3자리밖에 생각이 안 난다면, 모르는 자리를 . 으로 둔 정규식(예: 12.4)으로 검색을 하면 된다

import re
p = re.compile("ca.e")
# Pattern cheat sheet:
#   .  (ca.e) : any single character  -> care, cafe, case
#   ^  (^de)  : start of the string   -> desk, destination
#   $  (se$)  : end of the string     -> case, base


def print_match(m):
    """Print the matched text, or a notice when there is no match."""
    if not m:
        print("매칭되지 않음.")
        return
    print(m.group())


# match() checks whether the string matches the pattern from its beginning.
m = p.match("carelsee")
print_match(m)
# Calling m.group() directly would raise an error when nothing matched,
# which is why the result goes through print_match() instead.

- search: 주어진 문자열 중에 일치하는 게 있는지

import re
p = re.compile("ca.e")
# Pattern cheat sheet:
#   .  (ca.e) : any single character  -> care, cafe, case
#   ^  (^de)  : start of the string   -> desk, destination
#   $  (se$)  : end of the string     -> case, base


def print_match(m):
    """Print the details of a match object, or a notice when there is none."""
    if not m:
        print("매칭되지 않음.")
        return
    print("m.group()", m.group())  # the matched text
    print("m.string:", m.string)  # the input string that was searched
    print("m.start():", m.start())  # start index of the match
    print("m.end():", m.end())  # end index of the match
    print("m.span():", m.span())  # (start, end) of the match


# The earlier match("carelsee") experiment (match only from the beginning of
# the string) is superseded by search() below.

# search() checks whether the pattern occurs anywhere in the string.
m = p.search("careless")
print_match(m)

- findall: 일치하는 모든 것을 리스트 형태로 반환

import re
p = re.compile("ca.e")
# Pattern cheat sheet:
#   .  (ca.e) : any single character  -> care, cafe, case
#   ^  (^de)  : start of the string   -> desk, destination
#   $  (se$)  : end of the string     -> case, base


def print_match(m):
    """Print the details of a match object, or a notice when there is none."""
    if not m:
        print("매칭되지 않음.")
        return
    details = (
        ("m.group()", m.group()),  # the matched text
        ("m.string:", m.string),  # the input string that was searched
        ("m.start():", m.start()),  # start index of the match
        ("m.end():", m.end()),  # end index of the match
        ("m.span():", m.span()),  # (start, end) of the match
    )
    for label, value in details:
        print(label, value)


# findall() returns every match as a list.
lst = p.findall("careless")
print(lst)

# . (ca.e): 하나의 문자를 의미 care, cafe, case

# ^ (^de): 문자열의 시작 desk, destination

# $ (se$): 문자열의 끝 case, base

Beautifulsoup4: bs4

import requests
from bs4 import BeautifulSoup
url = "https://comic.naver.com/index"
res = requests.get(url)
res.raise_for_status()

# Turn the fetched HTML document into a BeautifulSoup object (lxml parser).
soup = BeautifulSoup(res.text, "lxml")
print(soup.title)
# get_text has to be *called*; printing the bare attribute shows the bound
# method object instead of the title text.
print(soup.title.get_text())

print(soup.a)  # the first <a> element found in the document
print(soup.a.attrs)  # the attributes of that <a> element
print(soup.a["href"])  # the value of its href attribute

# First <a> element whose class is "Nbtn_upload".
print(soup.find("a", attrs={"class": "Nbtn_upload"}))
# First element of any tag whose class is "Nbtn_upload".
print(soup.find(attrs={"class": "Nbtn_upload"}))

print(soup.find("li", attrs={"class": "rank01"}))

# From the <li class="rank01"> element above, print only its first <a> element.
rank1 = soup.find("li", attrs={"class": "rank01"})
print(rank1.a)

- 형제 element 가져오는 법

import requests
from bs4 import BeautifulSoup
url = "https://comic.naver.com/index"
res = requests.get(url)
res.raise_for_status()

# Turn the fetched HTML document into a BeautifulSoup object (lxml parser).
soup = BeautifulSoup(res.text, "lxml")
print(soup.title)
# get_text has to be *called*; printing the bare attribute shows the bound
# method object instead of the title text.
print(soup.title.get_text())


rank1 = soup.find("li", attrs={"class": "rank01"})
print(rank1.a.get_text())
# next_sibling is applied twice per step: the node directly after each <li>
# is presumably the whitespace text between the tags (print
# rank1.next_sibling to inspect it), so the second hop reaches the next <li>.
rank2 = rank1.next_sibling.next_sibling
rank3 = rank2.next_sibling.next_sibling
print(rank3.a.get_text())

- 태그 사이의 띄어쓰기(줄바꿈·공백)를 고려하지 않고 다음 형제 태그를 찾고 싶다면 find_next_sibling을 쓴다

rank2 = rank1.find_next_sibling("li") # next <li> sibling of rank1, regardless of whitespace in between
rank3 = rank2.find_next_sibling("li") # next <li> sibling of rank2, regardless of whitespace in between

- 형제들을 모두 가지고 오고 싶다면?

print(rank1.find_next_siblings("li")) # all <li> siblings that follow rank1

- 이런 것도 가능하다.

# Find the <a> element whose text is exactly this episode title.
# `string=` is the current name of this filter; `text=` is its deprecated
# legacy spelling (renamed in Beautiful Soup 4.4.0).
webtoon = soup.find("a", string="전지적 독자 시점-110. Ep. 22 세 가지 약속 (3)")
print(webtoon)

쿠팡예시 - 페이지 넘김

- GET 방식: 요청 정보가 주소(URL)에 그대로 담기는 방식. 메뉴를 클릭하거나 페이지를 넘길 때 주소의 값(숫자, 예: page=1)들이 변경되는 경우

- POST 방식: 요청 정보가 주소에 드러나지 않고 본문(body)에 담겨 전송되는 방식. 비밀번호를 치고 들어가거나, 게시판에 글을 남길 때

from operator import attrgetter
import requests
import re
from bs4 import BeautifulSoup
url = "https://www.coupang.com/np/search?q=%EB%85%B8%ED%8A%B8%EB%B6%81&channel=user&component=&eventCategory=SRP&trcid=&traid=&sorter=scoreDesc&minPrice=&maxPrice=&priceRange=&filterType=&listSize=36&filter=&isPriceRange=false&brand=&offerCondition=&rating=0&page=1&rocketAll=false&searchIndexingToken=1=6&backgroundColor="
# Send a browser User-Agent header along with the request.
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"}
res = requests.get(url, headers=headers)
res.raise_for_status()
soup = BeautifulSoup(res.text, "lxml")

# Product entries are the <li> elements whose class starts with
# "search-product".  (Fixed: the call was misspelled `re.complile`, which
# raises AttributeError before any item is processed.)
items = soup.find_all("li", attrs={"class": re.compile("^search-product")})
for item in items:

    # Skip advertised products.
    ad_badge = item.find("span", attrs={"class": "ad_badge-text"})
    if ad_badge:
        print("<광고상품은 제외합니다.>")
        continue

    name = item.find("div", attrs={"class": "name"}).get_text()
    price = item.find("strong", attrs={"class": "price-value"}).get_text()  # price

    # Skip products without a rating.  The notice is printed like the ad
    # notice above; it used to be assigned to `rate` and then thrown away by
    # the `continue`.
    rate = item.find("em", attrs={"class": "rating"})
    if not rate:
        print("평점 없는 상품은 제외합니다")
        continue
    rate = rate.get_text()

    # Number of ratings: drop the first and last character of the text
    # (presumably surrounding parentheses -- verify against the page) and any
    # thousands separators so that int() can parse it.
    rate_cnt = item.find("span", attrs={"class": "rating-total-count"}).get_text()
    rate_cnt = rate_cnt[1:-1].replace(",", "")

    # Only show products rated above 4.5 that have at least 50 ratings.
    if float(rate) > 4.5 and int(rate_cnt) >= 50:
        print(name, price, rate, rate_cnt)

- 최근 5개 페이지에서 제품을 조회하는 법. 페이지 넘겨가며 조회하는 방법

HTML에 대한 기초지식 이해

requests

정규식

Beautifulsoup4: bs4

쿠팡예시 - 페이지 넘김

티스토리툴바