다음 랭킹 뉴스 크롤링

다음의 랭킹 뉴스 페이지는 "많이 본 순", "열독률 높은 순", "댓글 많은 순", "연령, 성별"로 정리되어 있다. 각각의 랭킹뉴스 페이지의 url은 다음과 같은 형식으로 이루어져 있다.

많이 본 순 : "https://news.daum.net/ranking/popular?regDate=" + str(date)
열독률 높은 순 : "https://news.daum.net/ranking/kkomkkom?regDate=" + str(date)
댓글 많은 순 : "https://news.daum.net/ranking/bestreply?regDate=" + str(date)
연령, 성별 : "https://news.daum.net/ranking/age?regDate=" + str(date)

기본적으로 각 tag에 있는 랭킹 뉴스들은 동일한 형식을 띠지 않는다. popular tag에 있는 뉴스는 순위가 50위까지 나와 있고, kkomkkom tag에는 30위까지 나와 있으며, bestreply tag에는 뉴스뿐만 아니라 댓글 수와 관련된 정보가 함께 나와 있고, age tag에는 20대 여성부터 50대 남성까지 많이 본 순으로 뉴스가 나열되어 있다. 따라서 모든 tag를 한 번에 처리하기에는 무리가 있으므로 각 tag에 알맞게 크롤링해 오는 함수를 따로 작성해 주었다.

# Set the working directory and the ranking tags to crawl
import os
# NOTE: hard-coded absolute path on the author's machine. The CSV paths written
# by get_ranking_news ("Daum/<tag>/...") are relative, so they resolve under
# this directory.
os.chdir(r"C:/Users/cjy89/NLP/Project_news_crawling/")
# URL path segments of the four Daum ranking pages; get_ranking_news iterates
# over this list to build each ranking URL.
tags = ["popular", "kkomkkom", "bestreply", "age"]

import re
import requests
import pandas as pd
import time
from bs4 import BeautifulSoup

# 많이 본 순
def popular(soup, date, limit=50):
    """Collect the "most viewed" ranking articles from a parsed ranking page.

    Args:
        soup: Parsed ranking page for the ``popular`` tag (must support
            ``find_all(class_=...)``, e.g. a BeautifulSoup document).
        date: Ranking date; stored in every row as ``int(date)``.
        limit: Maximum number of ranked entries to collect. Defaults to 50,
            the number of entries the original code assumed.

    Returns:
        pandas.DataFrame with one row per article and the columns
        Date, Rank, Press, URL, Title and Main Article. If the page has no
        ranking entries the frame is empty (no rows, no columns).
    """
    ranking_box = soup.find_all(class_="rank_num rank_popular")
    contents_box = soup.find_all(class_="cont_thumb")
    day = int(date)

    # Rank, press, URL and title. zip() stops at the shorter list and the
    # slice caps the result at `limit`, so a page with fewer entries than
    # expected no longer raises IndexError.
    rows = []
    for rank_tag, content_tag in list(zip(ranking_box, contents_box))[:limit]:
        news_info = content_tag.find(class_="tit_thumb")
        anchor = news_info.find('a')
        rows.append({
            'Date': day,
            'Rank': rank_tag.find(class_="screen_out").get_text(),
            'Press': news_info.find(class_='info_news').get_text(),
            'URL': anchor['href'],
            'Title': anchor.get_text(),
        })

    # Main text of each article.
    for row in rows:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        resp = requests.get(row['URL'], timeout=10)
        article_soup = BeautifulSoup(resp.text, "html5lib")
        contents = article_soup.find(class_="news_view")
        article = None
        if contents is not None:
            article = contents.find(class_="article_view")
        # Articles without the expected layout get None instead of raising
        # AttributeError on a missing node.
        row['Main Article'] = article.get_text() if article is not None else None

    return pd.DataFrame(rows)

# 열독률 높은 순
def kkomkkom(soup, date, limit=30):
    """Collect the "most thoroughly read" ranking articles from a parsed page.

    Args:
        soup: Parsed ranking page for the ``kkomkkom`` tag (must support
            ``find_all(class_=...)``, e.g. a BeautifulSoup document).
        date: Ranking date; stored in every row as ``int(date)``.
        limit: Maximum number of ranked entries to collect. Defaults to 30,
            the number of entries the original code assumed.

    Returns:
        pandas.DataFrame with one row per article and the columns
        Date, Rank, Press, URL, Title and Main Article. If the page has no
        ranking entries the frame is empty (no rows, no columns).
    """
    ranking_box = soup.find_all(class_="rank_num rank_popular")
    contents_box = soup.find_all(class_="cont_thumb")
    day = int(date)

    # Rank, press, URL and title. zip() stops at the shorter list and the
    # slice caps the result at `limit`, so a short page no longer raises
    # IndexError.
    rows = []
    for rank_tag, content_tag in list(zip(ranking_box, contents_box))[:limit]:
        news_info = content_tag.find(class_="tit_thumb")
        anchor = news_info.find('a')
        rows.append({
            'Date': day,
            'Rank': rank_tag.find(class_="screen_out").get_text(),
            'Press': news_info.find(class_='info_news').get_text(),
            'URL': anchor['href'],
            'Title': anchor.get_text(),
        })

    # Main text of each article.
    for row in rows:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        resp = requests.get(row['URL'], timeout=10)
        article_soup = BeautifulSoup(resp.text, "html5lib")
        contents = article_soup.find(class_="news_view")
        article = None
        if contents is not None:
            article = contents.find(class_="article_view")
        # Articles without the expected layout get None instead of raising
        # AttributeError on a missing node.
        row['Main Article'] = article.get_text() if article is not None else None

    return pd.DataFrame(rows)

# 댓글 많은 순
def bestreply(soup, date, limit=30):
    """Collect the "most commented" ranking articles from a parsed page.

    Args:
        soup: Parsed ranking page for the ``bestreply`` tag (must support
            ``find_all(class_=...)``, e.g. a BeautifulSoup document).
        date: Ranking date; stored in every row as ``int(date)``.
        limit: Maximum number of ranked entries to collect. Defaults to 30,
            the number of entries the original code assumed.

    Returns:
        pandas.DataFrame with one row per article and the columns
        Date, Rank, Comment, Press, URL, Title and Main Article. If the page
        has no ranking entries the frame is empty (no rows, no columns).
    """
    ranking_box = soup.find_all(class_="rank_num")
    contents_box = soup.find_all(class_="cont_thumb")
    day = int(date)

    # Rank, comment info, press, URL and title. zip() stops at the shorter
    # list and the slice caps the result at `limit`, so a short page no
    # longer raises IndexError.
    rows = []
    for rank_tag, content_tag in list(zip(ranking_box, contents_box))[:limit]:
        news_info = content_tag.find(class_="tit_thumb")
        anchor = news_info.find('a')
        rows.append({
            'Date': day,
            'Rank': rank_tag.find(class_="screen_out").get_text(),
            'Comment': rank_tag.find(class_="ico_news2").get_text(),
            'Press': news_info.find(class_='info_news').get_text(),
            'URL': anchor['href'],
            'Title': anchor.get_text(),
        })

    # Main text of each article.
    for row in rows:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        resp = requests.get(row['URL'], timeout=10)
        article_soup = BeautifulSoup(resp.text, "html5lib")
        contents = article_soup.find(class_="news_view")
        article = None
        if contents is not None:
            article = contents.find(class_="article_view")
        # Articles without the expected layout get None instead of raising
        # AttributeError on a missing node.
        row['Main Article'] = article.get_text() if article is not None else None

    return pd.DataFrame(rows)

# 나이, 성별
def age(soup, date):
    """Collect the per-age-group, per-sex ranking articles from a parsed page.

    The page is read as a list of ``rank_female`` sections followed by a list
    of ``rank_male`` sections. Up to four sections are taken per sex and up
    to five articles per section, in page order.

    Args:
        soup: Parsed ranking page for the ``age`` tag (must support
            ``find_all(class_=...)``, e.g. a BeautifulSoup document).
        date: Ranking date; stored in every row as ``int(date)``.

    Returns:
        pandas.DataFrame with one row per article and the columns
        Date, Age, Sex, Rank, Press, URL, Title and Main Article. If the page
        has no ranking sections the frame is empty (no rows, no columns).
    """
    female = soup.find_all(class_="rank_female")
    male = soup.find_all(class_="rank_male")
    day = int(date)

    rows = []
    for sections in (female, male):          # female -> male
        # Slicing instead of indexing tolerates pages with fewer sections.
        for section in sections[:4]:         # 20s -> 50s
            label = section.find(class_="txt_news").get_text()
            # The first three characters are stored as the age group and
            # everything from the fifth character on as the sex.
            age_group, sex = label[:3], label[4:]
            press_list = section.find_all(class_="info_news")
            news_list = section.find_all(class_="link_txt")

            top_five = list(zip(press_list, news_list))[:5]
            for rank, (press_tag, news_tag) in enumerate(top_five, start=1):
                rows.append({
                    'Date': day,
                    'Age': age_group,
                    'Sex': sex,
                    'Rank': rank,
                    'Press': press_tag.get_text(),
                    'URL': news_tag['href'],
                    'Title': news_tag.get_text(),
                })

    # Main text of each article.
    for row in rows:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        resp = requests.get(row['URL'], timeout=10)
        article_soup = BeautifulSoup(resp.text, "html5lib")
        contents = article_soup.find(class_="news_view")
        article = None
        if contents is not None:
            article = contents.find(class_="article_view")
        # Articles without the expected layout get None instead of raising
        # AttributeError on a missing node.
        row['Main Article'] = article.get_text() if article is not None else None

    # Build the frame once instead of concatenating onto an empty DataFrame
    # for every section.
    return pd.DataFrame(rows)

# main interface function
def get_ranking_news(date):
    """Crawl every ranking tag for one date and save each result as a CSV.

    For each entry in the module-level ``tags`` list, the ranking page is
    downloaded, handed to the crawler function of the same name, and the
    resulting DataFrame is written to ``Daum/<tag>/<date>_ranking_news.csv``
    (relative to the current working directory). Per-tag, total and average
    elapsed times are printed.

    Args:
        date: Ranking date used as the ``regDate`` query value and as the
            file-name prefix.

    Returns:
        None.

    Raises:
        ValueError: If ``tags`` contains a tag with no crawler function.
    """
    # Dispatch table instead of an if/elif chain, so an unknown tag fails
    # loudly rather than leaving `df` unbound or reusing the previous one.
    crawlers = {
        "popular": popular,
        "kkomkkom": kkomkkom,
        "bestreply": bestreply,
        "age": age,
    }

    total_time = 0
    for tag in tags:
        if tag not in crawlers:
            raise ValueError("No crawler defined for tag: " + str(tag))

        start = time.time()
        url = "https://news.daum.net/ranking/" + tag + "?regDate=" + str(date)
        # A timeout keeps one stalled connection from hanging the whole crawl.
        resp = requests.get(url, timeout=10)
        soup = BeautifulSoup(resp.text, "html5lib")

        df = crawlers[tag](soup, date)

        # Create the output directory first; otherwise to_csv fails after the
        # crawl has already been done.
        os.makedirs("Daum/" + tag, exist_ok=True)
        title = "Daum/" + tag + "/" + str(date) + "_ranking_news.csv"
        df.to_csv(title, sep=",", index=False, encoding="utf-8-sig")
        end = time.time()
        total_time += end-start
        print("Crawling " + str(date) + " " + tag + " news : ", end-start)
    print("Total time :", total_time)
    # Average over the actual number of tags rather than a hard-coded 4.
    print("Average time : ", total_time/len(tags) if tags else 0)
    print("───────────────────")

# Crawl 2021-01-01 through 2021-01-24 (dates are YYYYMMDD integers)
for day in range(20210101, 20210125):
    get_ranking_news(day)

각 tag에 따른 랭킹 뉴스를 성공적으로 크롤링해 온 것을 알 수 있다. 언론사별로 나뉘어 있던 네이버 랭킹 뉴스와는 다르게 다음 뉴스는 언론사별이 아닌 tag에 따라 랭킹 뉴스가 나뉘어 있으므로, 랭킹 뉴스에 올라가는 언론사 수가 다양하지 않을 수 있다. 또한 다음 뉴스를 이용하는 사용자의 특성 및 정치적 성향에 따라 많이 본 뉴스가 달라질 수 있으므로 이에 유의해야 한다.

728x90

저작자표시 비영리

'데이터 분석 & 시각화 > Crawling' 카테고리의 다른 글

네이버 랭킹 뉴스 크롤링 (1)	2021.01.28
크롤링 (5), beautifulsoup4로 네이버 기사 크롤링하기 (5)	2020.05.02
크롤링 (4), beautifulsoup4로 네이버 기사 크롤링하기 (0)	2020.05.02
크롤링(3) (0)	2020.04.15