📦분석 프로젝트/🚅 크롤링 분석

🚅 크롤링 분석 프로젝트 (4) - 코레일 기차표 크롤링

๋ฐ์ดํ„ฐํŒ์Šค 2024. 10. 17. 16:22

코레일 기차표 크롤링

robots.txt๋ฅผ ํ†ตํ•ด ํ•ด๋‹น ์‚ฌ์ดํŠธ๋Š” ํฌ๋กค๋ง์ด ๊ฐ€๋Šฅํ•จ์„ ํ™•์ธํ–ˆ์Šต๋‹ˆ๋‹ค.

ํ•˜์ง€๋งŒ ํฌ๋กค๋ง์„ ๊ณผ๋„ํ•˜๊ฒŒ ํ•˜๋Š” ๊ฒƒ์€ ์„œ๋ฒ„์— ๋ถ€๋‹ด์„ ์ค„ ์ˆ˜ ์žˆ๊ธฐ์—,

ํ•„์š”ํ•œ ๋ฐ์ดํ„ฐ๋งŒ ์ •์˜ํ•˜๊ณ  ๊ทธ ๋ถ€๋ถ„๋งŒ ์ž‘์—…ํ•˜๋Š” ๊ฒƒ์ด ์ข‹์Šต๋‹ˆ๋‹ค.

 

 

import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait

# Load one Korail search-result page in Chrome and parse its first <tbody>.
#
# The explicit-wait helpers are imported here because this cell is the first
# one that needs them.
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Selenium service backed by the driver binary that webdriver-manager installs.
service=Service(executable_path=ChromeDriverManager().install())
url='https://www.letskorail.com/ebizprd/EbizPrdTicketPr21111_i1.do?&txtGoAbrdDt=20241118&txtGoHour=071600&selGoYear=2024&selGoMonth=11&selGoDay=18&selGoHour=00&txtGoPage=2&txtGoStartCode=0001&txtGoStart=%EC%84%9C%EC%9A%B8&txtGoEndCode=0020&txtGoEnd=%EB%B6%80%EC%82%B0&selGoTrain=05&selGoRoom=&selGoRoom1=&txtGoTrnNo=&useSeatFlg=&useServiceFlg=&selGoSeat=&selGoService=&txtPnrNo=&hidRsvChgNo=&hidStlFlg=&radJobId=1&SeandYo=&hidRsvTpCd=03&selGoSeat1=015&selGoSeat2=&txtPsgCnt1=1&txtPsgCnt2=0&txtMenuId=11&txtPsgFlg_1=1&txtPsgFlg_2=0&txtPsgFlg_3=0&txtPsgFlg_4=0&txtPsgFlg_5=0&txtPsgFlg_8=0&chkCpn=N&txtSeatAttCd_4=015&txtSeatAttCd_3=000&txtSeatAttCd_2=000&txtGoStartCode2=&txtGoEndCode2=&hidDiscount=&hidEasyTalk=&adjcCheckYn=N'


driver=webdriver.Chrome(service=service)
try:
    driver.get(url)
    # Creating a WebDriverWait does not wait by itself; until() is what blocks.
    # Wait up to 10 seconds for a <tbody> to exist, because that is the element
    # parsed below. Raises TimeoutException if none appears in time.
    wait=WebDriverWait(driver,10)
    wait.until(EC.presence_of_element_located((By.TAG_NAME,'tbody')))

    # Page source as rendered by the browser, i.e. after JavaScript has run.
    html=driver.page_source
finally:
    # Always close the browser, even when loading or waiting fails.
    driver.quit()

# Parse the rendered HTML with BeautifulSoup.
soup = BeautifulSoup(html,'html.parser')

# First <tbody> in the document; displayed as the cell output.
table_contents=soup.find('tbody')
table_contents

결과물이 굉장히 복잡하게 되어 있습니다.

해당 내용들을 처리해서 보기 쉽게 만들어 보겠습니다.

 

 

# ๋ฐ์ดํ„ฐ๋ฅผ ์ €์žฅํ•  ๋นˆ ๋ฆฌ์ŠคํŠธ ์ƒ์„ฑ
data_rows=[]

# table_contents์˜ ๊ฐ tr ํƒ(ํ–‰)์— ๋Œ€ํ•ด์„œ ๋ฐ˜๋ณตํ•˜๋ฉด์„œ, td ๋ฐ์ดํ„ฐ๋ฅผ ์ปฌ๋Ÿผ์— ๋‹ด๊ธฐ
for tr in table_contents.find_all('tr'):
    # ๊ฐ ์—ด์— ํ•ด๋‹นํ•˜๋Š” ๋ฐ์ดํ„ฐ ์ถ”์ถœ
    data=[]
    for td in tr.find_all('td'):
        text=td.get_text(strip=True)
        # ์—ด์— ๋ฐ์ดํ„ฐ ์ถ”๊ฐ€
        data.append(text)
    #๋ฐ์ดํ„ฐ๋ฅผ ํ–‰์œผ๋กœ ์ถ”๊ฐ€
    data_rows.append(data)

# ๋ฐ์ดํ„ฐ ํ”„๋ ˆ์ž„ ์ƒ์„ฑ
# column ์ด๋ฆ„
columns=[
    '๊ตฌ๋ถ„','์—ด์ฐจ๋ฒˆํ˜ธ','์ถœ๋ฐœ์‹œ๊ฐ','๋„์ฐฉ์‹œ๊ฐ','ํŠน์‹ค/์šฐ๋“ฑ์‹ค','์ผ๋ฐ˜์‹ค','์œ ์•„',
    '์ž์œ ์„/์ž…์„','์ธํ„ฐ๋„ทํŠน๊ฐ€(๋ฉค๋ฒ„์‹ญํ˜œํƒ)','์˜ˆ์•ฝ๋Œ€๊ธฐ','์ •์ฐจ์—ญ(๊ฒฝ์œ )','์ฐจ๋Ÿ‰์œ ํ˜•/ํŽธ์„ฑ์ •๋ณด','์šด์ž„์š”๊ธˆ','์†Œ์š”์‹œ๊ฐ„'
]

df=pd.DataFrame(data_rows,columns=columns)
df

이렇게 특실/우등실과 일반실 칸 등이 비어 있는 것을 확인할 수 있습니다.

 

 

 

개발자도구를 열어보시면 그 이유를 알 수 있습니다.

해당 칸이 이미지로 되어있고 alt="예약하기"로 구성되어 있습니다.

 

 

 

# ๋ฐ์ดํ„ฐ๋ฅผ ์ €์žฅํ•  ๋นˆ ๋ฆฌ์ŠคํŠธ ์ƒ์„ฑ
data_rows=[]

# table_contents์˜ ๊ฐ tr ํƒ(ํ–‰)์— ๋Œ€ํ•ด์„œ ๋ฐ˜๋ณตํ•˜๋ฉด์„œ, td ๋ฐ์ดํ„ฐ๋ฅผ ์ปฌ๋Ÿผ์— ๋‹ด๊ธฐ
for tr in table_contents.find_all('tr'):
    # ๊ฐ ์—ด์— ํ•ด๋‹นํ•˜๋Š” ๋ฐ์ดํ„ฐ ์ถ”์ถœ
    data=[]
    for td in tr.find_all('td'):
        # td์•ˆ์— img ํƒœ๊ทธ๊ฐ€ ์žˆ๋Š”์ง€ ํ™•์ธ, alt ์†์„ฑ ์ถ”์ถœ
        img_tag=td.find('img')
        #img_tage๊ฐ€ ์กด์žฌํ•˜๋ฉด
        if img_tag:
            text=img_tag.get('alt','')
        else:    
            text=td.get_text(strip=True)
        # ์—ด์— ๋ฐ์ดํ„ฐ ์ถ”๊ฐ€
        data.append(text)
    #๋ฐ์ดํ„ฐ๋ฅผ ํ–‰์œผ๋กœ ์ถ”๊ฐ€
    data_rows.append(data)

# ๋ฐ์ดํ„ฐ ํ”„๋ ˆ์ž„ ์ƒ์„ฑ
# column ์ด๋ฆ„
columns=[
    '๊ตฌ๋ถ„','์—ด์ฐจ๋ฒˆํ˜ธ','์ถœ๋ฐœ์‹œ๊ฐ','๋„์ฐฉ์‹œ๊ฐ','ํŠน์‹ค/์šฐ๋“ฑ์‹ค','์ผ๋ฐ˜์‹ค','์œ ์•„',
    '์ž์œ ์„/์ž…์„','์ธํ„ฐ๋„ทํŠน๊ฐ€(๋ฉค๋ฒ„์‹ญํ˜œํƒ)','์˜ˆ์•ฝ๋Œ€๊ธฐ','์ •์ฐจ์—ญ(๊ฒฝ์œ )','์ฐจ๋Ÿ‰์œ ํ˜•/ํŽธ์„ฑ์ •๋ณด','์šด์ž„์š”๊ธˆ','์†Œ์š”์‹œ๊ฐ„'
]

df=pd.DataFrame(data_rows,columns=columns)
df

이번엔 해당 정보들이 모두 제대로 들어갔습니다.

 

 

 

 

여러 페이지 기차 예매 크롤링

๊ฐœ๋ฐœ์ž๋„๊ตฌ๋ฅผ ์—ฌ์‹  ๋‹ค์Œ์— ๋‹ค์Œ์„ ๋ˆ„๋ฅธ ๋‹ค์Œ์— ํ•ด๋‹น์ค„์„ ์šฐํด๋ฆญ ํ•ด์ฃผ์„ธ์š”

๊ทธ๋Ÿผ Copy > Copy Xpath๋ฅผ ํด๋ฆญํ•ด์ฃผ์„ธ์š”

Xpath์˜ ๋‹ค์Œ ๋ฒ„ํŠผ์„ ํด๋ฆญํ•˜์—ฌ, ์ ‘์†๋œ ํŽ˜์ด์ง€์˜ url์„ ๊ฐ€์ ธ์˜ค๋Š” ๊ฒƒ ๊นŒ์ง€ ์ˆ˜ํ–‰ํ•˜๊ฒ ์Šต๋‹ˆ๋‹ค.

 

# ํŽ˜์ด์ง€๋„ค์ด์…˜ : ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ•œ ๋ฒ„์ ผ
# URL๊นŒ์ง€ ํฌํ•จ๋œ ๋ฐ์ดํ„ฐ ํ”„๋ ˆ์ž„ ์ƒ์„ฑ
# Xpath, '๋‹ค์Œ' ๋ฒ„ํŠผ ํด๋ฆญ, ์ ‘์†๋œ ํŽ˜์ด์ง€์˜ url์„ ๊ฐ€์ ธ์˜ค๋Š” ๊ฒƒ ๊นŒ์ง€ ์ˆ˜ํ–‰
from selenium.webdriver.common.by import By
import time


# selenium ์›น ๋“œ๋ผ์ด๋ฒ„ ์„œ๋น„์Šค ์„ค์ •
service=Service(executable_path=ChromeDriverManager().install())
driver=webdriver.Chrome(service=service)
url='https://www.letskorail.com/ebizprd/EbizPrdTicketPr21111_i1.do?&txtGoAbrdDt=20241020&txtGoHour=065700&selGoYear=2024&selGoMonth=10&selGoDay=20&selGoHour=00&txtGoPage=2&txtGoStartCode=0001&txtGoStart=%EC%84%9C%EC%9A%B8&txtGoEndCode=0020&txtGoEnd=%EB%B6%80%EC%82%B0&selGoTrain=05&selGoRoom=&selGoRoom1=&txtGoTrnNo=&useSeatFlg=&useServiceFlg=&selGoSeat=&selGoService=&txtPnrNo=&hidRsvChgNo=&hidStlFlg=&radJobId=1&SeandYo=&hidRsvTpCd=03&selGoSeat1=015&selGoSeat2=&txtPsgCnt1=1&txtPsgCnt2=0&txtMenuId=11&txtPsgFlg_1=1&txtPsgFlg_2=0&txtPsgFlg_3=0&txtPsgFlg_4=0&txtPsgFlg_5=0&txtPsgFlg_8=0&chkCpn=N&txtSeatAttCd_4=015&txtSeatAttCd_3=000&txtSeatAttCd_2=000&txtGoStartCode2=&txtGoEndCode2=&hidDiscount=&hidEasyTalk=&adjcCheckYn=N'


# ๋ฐ์ดํ„ฐ๋ฅผ ์ €์žฅํ•  ๋นˆ ๋ฆฌ์ŠคํŠธ ์ƒ์„ฑ
data_rows=[]

# 5๋ฒˆ ๋ฐ˜๋ณต
for _ in range(5):
#Selenium ์›น ๋“œ๋ผ์ด๋ฒ„ ์‹คํ–‰
    driver.get(url) # ๊ฐฑ์‹ ๋œ url ๋ฐ˜๋ณต์ ์œผ๋กœ ์ „๋‹ฌ
    time.sleep(2) # 2์ดˆ ๋Œ€๊ธฐ

    # Selenium์œผ๋กœ javascript ์‹คํ–‰๋œ ํ›„์˜ ํŽ˜์ด์ง€ ์†Œ์Šค๋ฅผ ๊ฐ€์ ธ์˜ด
    html=driver.page_source


    # BeautifulSoup ์œผ๋กœ ํŒŒ์‹ฑ
    soup = BeautifulSoup(html,'html.parser')

    #table_contents๋กœ ํ•„์š”ํ•œ ๋ถ€๋ถ„๋งŒ ์ •์˜
    table_contents=soup.find('tbody')


    # table_contents์˜ ๊ฐ tr ํƒ(ํ–‰)์— ๋Œ€ํ•ด์„œ ๋ฐ˜๋ณตํ•˜๋ฉด์„œ, td ๋ฐ์ดํ„ฐ๋ฅผ ์ปฌ๋Ÿผ์— ๋‹ด๊ธฐ
    for tr in table_contents.find_all('tr'):
        # ๊ฐ ์—ด์— ํ•ด๋‹นํ•˜๋Š” ๋ฐ์ดํ„ฐ ์ถ”์ถœ
        data=[]
        for td in tr.find_all('td'):
            # td์•ˆ์— img ํƒœ๊ทธ๊ฐ€ ์žˆ๋Š”์ง€ ํ™•์ธ, alt ์†์„ฑ ์ถ”์ถœ
            img_tag=td.find('img')
            #img_tage๊ฐ€ ์กด์žฌํ•˜๋ฉด
            if img_tag:
                text=img_tag.get('alt','')
            else:    
                text=td.get_text(strip=True)
            # ์—ด์— ๋ฐ์ดํ„ฐ ์ถ”๊ฐ€
            data.append(text)
        #data, ์ฆ‰ ๋ฐฉ๊ธˆ๊นŒ์ง€ td ํƒœ๊ทธ๋“ค์ด ์Œ“์ธ data ๋ฆฌ์ŠคํŠธ์— url๋„ ํ•˜๋‚˜ ๋” ์ถ”๊ฐ€
        data.append(url)
        #๋ฐ์ดํ„ฐ๋ฅผ ํ–‰์œผ๋กœ ์ถ”๊ฐ€
        data_rows.append(data)

    #'๋‹ค์Œ' ๋ฒ„ํŠผ ์ฐพ์•„์„œ ํด๋ฆญ
    try:
        next_button=driver.find_element(By.XPATH,'//*[@id="divResult"]/table[2]/tbody/tr/td/a[2]') # copy XPATH
    except: # ์ด์ „ ๋ฒ„ํŠผ์ด ์—†์–ด์„œ ์œ„์น˜๊ฐ€ 2๋ฒˆ์งธ๊ฐ€ ์•„๋‹ˆ๋ผ 1๋ฒˆ์งธ aํƒœ๊ทธ๋กœ ๋ณ€๊ฒฝ๋œ
        next_button=driver.find_element(By.XPATH,'//*[@id="divResult"]/table[2]/tbody/tr/td/a[1]') # copy XPATH
    next_button.click()
    time.sleep(2)

    # ๋‹ค์Œ ๋ฒ„ํŠผ ๋ˆ„๋ฅด๊ณ  ์ด๋™๋œ ํŽ˜์ด์ง€์˜ ํ˜„์žฌ url์„ ๋‹ค์‹œ url ๋ณ€์ˆ˜์— ๋„ฃ์–ด์„œ ๊ฐฑ์‹ 
    url=driver.current_url

driver.quit()

# ๋ฐ์ดํ„ฐ ํ”„๋ ˆ์ž„ ์ƒ์„ฑ
# column ์ด๋ฆ„
columns=[
    '๊ตฌ๋ถ„','์—ด์ฐจ๋ฒˆํ˜ธ','์ถœ๋ฐœ์‹œ๊ฐ','๋„์ฐฉ์‹œ๊ฐ','ํŠน์‹ค/์šฐ๋“ฑ์‹ค','์ผ๋ฐ˜์‹ค','์œ ์•„',
    '์ž์œ ์„/์ž…์„','์ธํ„ฐ๋„ทํŠน๊ฐ€(๋ฉค๋ฒ„์‹ญํ˜œํƒ)','์˜ˆ์•ฝ๋Œ€๊ธฐ','์ •์ฐจ์—ญ(๊ฒฝ์œ )','์ฐจ๋Ÿ‰์œ ํ˜•/ํŽธ์„ฑ์ •๋ณด','์šด์ž„์š”๊ธˆ','์†Œ์š”์‹œ๊ฐ„','url'
]

df=pd.DataFrame(data_rows,columns=columns)
df

이렇게 정상적으로 크롤링 됩니다.

하지만 9,10번 인덱스의 내용이 겹치게 됩니다.

그 이유는 웹사이트 자체에서 마지막 예약내용과 그 다음페이지 첫 예약내용이 동일하기 때문입니다.

 

 

# Keep the first row for each value of the column named in `subset`
# (the departure-time column) and discard every later repeat.
df=df[~df.duplicated(subset=['์ถœ๋ฐœ์‹œ๊ฐ'],keep='first')]
df

해당 코드를 사용하여 출발시각이 겹치는 것 중 첫 번째 것만 남기고 나머지는 모두 제거합니다.

 

 

# Keep only the rows whose value in the selected column equals the scraped
# <img> alt text of the reserve button. The literal must be the real Korean
# text '예약하기'; a mis-encoded copy never equals the scraped value and
# leaves the result empty.
fast_reserve_df=df[df['์ผ๋ฐ˜์‹ค']=='예약하기']
# Renumber from 0 so that label 0 is the first remaining row.
fast_reserve_df=fast_reserve_df.reset_index(drop=True)
fast_reserve_df.head()

# URL recorded for the first remaining row, or None when no row matched
# (indexing label 0 on an empty frame would raise KeyError).
fast_reserve_df.loc[0,'url'] if not fast_reserve_df.empty else None

입석+좌석 으로 되어있는 묶음 상품을 제외하고

입석으로 갈 수 있고 예약이 가능한 가장 빠른 상품을 추출할 수 있습니다.

만약 기차를 자주타고 예약을 해야하는 상황이고 크롤링을 통해 코드를 자동화 해둔다면

해당 예약 페이지로 바로 갈 수 있기에 효율적이게 됩니다!