📦분석 프로젝트/🚅 크롤링 분석

๐Ÿš… ํฌ๋กค๋ง ๋ถ„์„ ํ”„๋กœ์ ํŠธ (6) - ๊ด€๊ด‘์ƒํ’ˆ ๋ฆฌ๋ทฐ ํฌ๋กค๋ง

๋ฐ์ดํ„ฐํŒ์Šค 2024. 10. 18. 23:58

๊ด€๊ด‘์ƒํ’ˆ ๋ฆฌ๋ทฐ ํฌ๋กค๋ง ๋ถ„์„

# only selenium
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Set up the Selenium Chrome driver service and launch the browser.
service = Service(executable_path=ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# Empty list that will hold one record per review.
reviews = []

try:
    url = 'https://www.hanatour.com/trp/pkg/CHPC0PKG0200M200?pkgCd=AVP231241101ZEA&prePage=major-products&directSale=PM0000114930'
    driver.get(url)

    # Open the travel-review tab. Wait (up to 10 s) until the tab is actually
    # clickable instead of merely present in the DOM, so the click is not
    # attempted before the page has finished rendering it.
    review_link = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="sticky06-bottom"]'))
    )
    review_link.click()
    time.sleep(2)  # give the review list a moment to load after the click
finally:
    # Always close the browser, even when the wait or the click fails.
    driver.quit()

์›นํŽ˜์ด์ง€๊ฐ€ ๋กœ๋“œ ๋˜๊ธฐ์ „์— ์—ฌํ–‰ํ›„๊ธฐ ๋ฒ„๋“ ์„ ํด๋ฆญํ•  ์ˆ˜ ์žˆ๊ธฐ์—

'여행후기' 버튼이 뜰 때까지 안전하게 기다린 후 버튼을 눌러 크롤링 하겠습니다.

 

 

F12를 누른 다음 리뷰 맨 위에 있는 것을 누르면 다음과 같이 나옵니다.

li ๋ผ๋Š” ํƒœ๊ทธ๊ฐ€ ๋ถ™์€ ๊ฒƒ๋“ค์ด ์—ฌ๋Ÿฌ๊ฐœ ์žˆ์ฃ ? ์ด๊ฑธ ๋‹ค ๊ฐ€์ ธ์˜ฌ ๊ฒ๋‹ˆ๋‹ค.

 

 

 ์ด๋ฒˆ์—” copy selector๋ฅผ ๋ˆŒ๋Ÿฌ์ฃผ์„ธ์š”

 

# Fetch the review entries: every <li> that is a direct child of
# <ul class="list_review_v2">.
lis = driver.find_elements(
    By.CSS_SELECTOR,
    'ul.list_review_v2>li',
)

ul class="list_review_v2" 아래에 있는 태그들을 모두 가져왔습니다.

 

 

์ด์ œ ์ด ์ •๋ณด๋“ค์„ ๊ฐ€์ ธ์˜ฌ ๊ฒ๋‹ˆ๋‹ค.

F12를 누르면서 어떻게 구성되어 있는지 직접 확인해야 합니다.

๋ณ„์  ์ •๋ณด๋ฅผ ๊ฐ€์ ธ์˜ค๋ ค๋ฉด strong ํƒœ๊ทธ ๋ถ€๋ถ„์„ ๊ฐ€์ ธ์™€์•ผ ํ•ฉ๋‹ˆ๋‹ค.

 

 

좌측에 커서를 대고 하나씩 옮기게 되면 strong 태그의 위치가 바뀌게 됩니다.

๋‹ค์Œ ํƒœ๊ทธ์˜ ์ด๋ฆ„์€ A์ด๊ธด ํ•˜์ง€๋งŒ, ํ˜„์žฌ ์ œ๊ฐ€ ์œ„์น˜ํ•œ ํƒœ๊ทธ์—์„œ ํ•˜๋‚˜ ๋” ํฐ ํ…์ŠคํŠธ๋ฅผ ์ฐพ์•„์„œ ์ž…๋ ฅํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. 

3page์˜ XPath๋ฅผ ์นดํ”ผํ•ด์˜ต๋‹ˆ๋‹ค.

 

 

 

# Full crawl: open the product page, switch to the travel-review tab, walk the
# paginated review list and collect every review into a pandas DataFrame.


def _first_text(parent, by, value):
    """Return the text of the first element under *parent* matching the locator.

    Uses ``find_elements`` (which returns an empty list when nothing matches)
    so that a missing element yields "" instead of raising.
    """
    matches = parent.find_elements(by, value)
    return matches[0].text if matches else ""


def _span_text(spans, index):
    """Return ``spans[index].text``, or "" when that position does not exist."""
    try:
        return spans[index].text
    except IndexError:
        return ""


def _extract_review(li):
    """Build the record (a dict of strings) for one review ``<li>`` element.

    Fields that cannot be found in the element are stored as "".
    """
    rating_blocks = li.find_elements(By.CLASS_NAME, 'rating_info')
    if rating_blocks:
        rating_info = rating_blocks[0]
        # Star rating lives in the <strong> tag of the rating block.
        rating = _first_text(rating_info, By.TAG_NAME, 'strong')
        spans = rating_info.find_elements(By.TAG_NAME, 'span')
    else:
        rating, spans = "", []

    # The <span> texts are mapped to fields by position: 1 -> user,
    # 2 -> category, 3 -> age, last -> date.
    return {
        'rating': rating,
        'user': _span_text(spans, 1),
        'category': _span_text(spans, 2),
        'age': _span_text(spans, 3),
        'date': _span_text(spans, -1),
        # Review body text.
        'review': _first_text(li, By.CLASS_NAME, 'review_cont'),
        # Review category label.
        'review_cate': _first_text(li, By.CLASS_NAME, 'review_cate'),
    }


# Set up the Selenium Chrome driver service and launch the browser.
service = Service(executable_path=ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# One dict per collected review.
reviews = []

try:
    url = 'https://www.hanatour.com/trp/pkg/CHPC0PKG0200M200?pkgCd=AVP231241101ZEA&prePage=major-products&directSale=PM0000114930'
    driver.get(url)

    # Open the travel-review tab. Wait (up to 10 s) until the tab is actually
    # clickable instead of merely present in the DOM.
    review_link = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="sticky06-bottom"]'))
    )
    review_link.click()
    time.sleep(2)

    for page_num in range(1, 230):  # pages 1 through 229
        # Progress output ("Current page:").
        print("현재 페이지:", page_num)

        # Every <li> directly under <ul class="list_review_v2"> is one review.
        for li in driver.find_elements(By.CSS_SELECTOR, 'ul.list_review_v2>li'):
            reviews.append(_extract_review(li))

        # Locate the pagination container. It is addressed by XPath because
        # looking it up by class name 'pagenate' did not find the element.
        paginate_divs = driver.find_elements(
            By.XPATH, '//*[@id="sticky06"]/div/div[4]/div[3]/div'
        )

        if not paginate_divs:
            next_links = []
        elif page_num % 10 != 0:
            # Pages 1-9, 11-19, 21-29, ...: click the numbered link whose text
            # is the next page number. The leading "." keeps the search inside
            # the pagination container; a bare "//" would search the whole
            # document even when called on an element.
            next_links = paginate_divs[0].find_elements(
                By.XPATH, f".//span/a[text()='{page_num + 1}']"
            )
        else:
            # Pages 10, 20, 30, ...: the next number is not shown yet, so click
            # the third <a> of the pagination bar to advance to the next group.
            next_links = driver.find_elements(
                By.XPATH, '//*[@id="sticky06"]/div/div[4]/div[3]/div/div/a[3]'
            )

        if not next_links:
            # No link to a following page ("Reached the last page.").
            print("마지막 페이지에 도달했습니다.")
            break

        try:
            next_links[0].click()
        except WebDriverException as exc:
            # Keep what has been collected so far, but do not mislabel a
            # click failure as the last page ("Could not move to the next page:").
            print("다음 페이지로 이동하지 못했습니다:", exc)
            break
        time.sleep(2)  # wait briefly for the next page of reviews to load
finally:
    # Always close the browser, even when scraping fails part-way.
    driver.quit()

# Build the DataFrame from the collected review records.
df = pd.DataFrame(reviews)

 

1~229ํŽ˜์ด์ง€์˜ ๊ด€๊ด‘์ƒํ’ˆ์„ ๊ฐ€์ ธ์˜ฌ ์ˆ˜ ์žˆ์—ˆ์Šต๋‹ˆ๋‹ค.

이제 수집한 자료를 가지고 시각화를 통한 데이터 분석을 하겠습니다.