크롤링 프로젝트 1
1. 스크레이핑
- 웹 사이트에 있는 특정 정보를 추출하는 기술
2. 크롤링
- 프로그램이 웹 사이트를 정기적으로 돌며 정보를 추출하는 기술
--------------------------------------------------------------------------------------
스크레이핑 과정
1. 대상 URL 할당 (URL 지정)
2. 웹 문서 추출 및 인코딩 :
req = urllib.request.urlopen(url)
encoding = req.info().get_content_charset(failobj="utf-8")
html = req.read().decode(encoding)
3. csv 저장모드로 오픈
with open("hanbit book list.csv", 'w', newline='') as f:
writer = csv.writer(f) #한줄씩 입력
writer.writerow(["Title", "URL"]) #Title과 URL 제목 라인
4. 특정 태그의 Data 추출 및 정제
for partial_html in re.findall(r'<p class="book_tit"><a.*?</p>', html, re.DOTALL):
url = re.search(r'<a href="(.*?)">', partial_html).group(1)
book_url = "http://www.hanbit.co.kr" + url
title = re.sub(r'<.*?>','', partial_html)
title = unescape(title)
5. Data 입력
writer.writerow([title, book_url])
코드
import re
import urllib.request
from html import unescape
import csv
#1
url = "http://www.hanbit.co.kr/store/books/new_book_list.html"
req = urllib.request.urlopen(url)
header = req.info()
encoding = req.info().get_content_charset(failobj="utf-8")
html = req.read().decode(encoding)
with open("hanbit book list.csv", 'w', newline='') as f:
writer = csv.writer(f)
writer.writerow(["Title", "URL"])
for partial_html in re.findall(r'<p class="book_tit"><a.*?</p>', html, re.DOTALL):
url = re.search(r'<a href="(.*?)">', partial_html).group(1)
book_url = "http://www.hanbit.co.kr" + url
title = re.sub(r'<.*?>','', partial_html)
title = unescape(title)
writer.writerow([title, book_url])
print("완료")
SQL 버전
입력 코드
import re, sqlite3
from urllib.request import urlopen
from html import unescape
def main():
    """Fetch the Hanbit full book list, scrape it, and store it in SQLite."""
    page = fetch("http://www.hanbit.co.kr/store/books/full_book_list.html")
    save("books.db", scrape(page))
def fetch(url):
    """Download *url* and return the page body decoded to str.

    The charset declared in the Content-Type response header is used
    when present, falling back to UTF-8.
    """
    # with-block ensures the HTTP response is closed even if decoding fails
    # (the original never closed it).
    with urlopen(url) as req:
        encoding = req.info().get_content_charset(failobj="utf-8")
        return req.read().decode(encoding)
def scrape(html):
    """Extract book entries from the full-book-list HTML.

    Returns a list of dicts with keys "url" (absolute link) and
    "title" (tag-stripped, entity-decoded text).
    """
    # Pre-compile the patterns used on every table cell.
    cell_pat = re.compile(r'<td class="left"><a.*?</td>', re.DOTALL)
    href_pat = re.compile(r'<a href="(.*?)">')
    tag_pat = re.compile(r'<.*?>')

    results = []
    for cell in cell_pat.findall(html):
        href = href_pat.search(cell).group(1)
        text = unescape(tag_pat.sub('', cell))
        results.append({"url": "http://www.hanbit.co.kr" + href, "title": text})
    return results
def save(db_path, books):
    """(Re)create the books table in *db_path* and insert *books*.

    *books* is an iterable of {"title": str, "url": str} mappings.
    The table is dropped and rebuilt each call, so repeated runs are
    idempotent rather than accumulating duplicate rows.
    """
    conn = sqlite3.connect(db_path)
    try:
        c = conn.cursor()
        c.execute("DROP TABLE IF EXISTS books")
        c.execute("CREATE TABLE books (title text, url text)")
        # Named placeholders pull the values straight from each dict.
        c.executemany("INSERT INTO books VALUES (:title, :url)", books)
        conn.commit()
    finally:
        # Close even if an execute raises, so the db file isn't left locked
        # (the original leaked the connection on error).
        conn.close()
# Entry point: run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
    print("완료")
SQLite 출력 코드
import sqlite3
# Print every row of the books table built by the scraper above.
connection = sqlite3.connect("books.db")
cursor = connection.cursor()
cursor.execute("SELECT * FROM books")
for record in cursor.fetchall():
    print(record)
connection.close()
print("완료")