
[Crawling] Crawling Daum News Articles

YSY^ 2020. 8. 4. 16:10

Daum News Articles

  1. Identify the request URL
  2. Send the request -> get the response, then extract the information from the response string (BeautifulSoup)
  • Selectors for the items to retrieve
    • title : h3.tit_view  # the h3 tag with class tit_view
    • reporter : span.info_view span.txt_info:nth-child(1)  # the span.txt_info that is the first child
    • date : span.info_view span.txt_info:nth-child(2)  # the span.txt_info that is the second child
    • content : div#harmonyContainer  # the div with ID harmonyContainer
  • Note: a class is selected with '.' as in h3.xxx, while an ID is selected with '#' as in div#xxx (see the sketch right below)
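
As a quick sanity check of the '.'/'#' syntax, here is a minimal sketch run against a toy HTML string; the markup is made up for illustration, only the selectors mirror the ones above:

from bs4 import BeautifulSoup

# toy markup imitating the article page (illustrative only)
html = '<h3 class="tit_view">headline</h3><div id="harmonyContainer">body</div>'
soup = BeautifulSoup(html, 'html.parser')
print(soup.select_one('h3.tit_view').text)           # match by class -> headline
print(soup.select_one('div#harmonyContainer').text)  # match by ID -> body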
url = "https://news.v.daum.net/v/20200722160412384"
import requests
from bs4 import BeautifulSoup



#1. request
res = requests.get(url)
if res.status_code == 200:
    soup = BeautifulSoup(res.text)
    title_tag = soup.select_one('h3.tit_view') 
    reporter_tag = soup.select_one('span.info_view span.txt_info:nth-child(1)') 
    input_date_tag = soup.select_one('span.info_view span.txt_info:nth-child(2)') 
    news_tag = soup.select_one('div#harmonyContainer')
    print(title_tag.text.strip())
    print(reporter_tag.text.strip())
    print(input_date_tag.select_one('span').text.strip()) #input_date_tag.span
    print(news_tag.text.strip())
else:
    print(url, 'fail', res.status_code)

# news_tag.text.replace('\n', ' ') only handles newlines; a regex catches all whitespace at once
import re
p = re.compile(r'\s+')  # \s matches spaces, newlines, and tabs
p.sub(' ', news_tag.text)
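
For a concrete before/after, apply the compiled pattern to a made-up sample string:

sample = 'first\n\n  second\tthird'
print(p.sub(' ', sample))  # -> 'first second third'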

Turning the code above into a function

# %%writefile 'daumnews.py'

import requests
from bs4 import BeautifulSoup
import re

def get_daumnews_info(url):
    res = requests.get(url)
    if res.status_code != 200:
        # returning early avoids a NameError from returning variables that were never assigned
        print(url, 'fail', res.status_code)
        return None
    soup = BeautifulSoup(res.text, 'html.parser')
    title = soup.select_one('h3.tit_view').text.strip()
    reporter = soup.select_one('span.info_view span.txt_info:nth-child(1)').text.strip()
    try:
        input_date = soup.select_one('span.info_view span.txt_info:nth-child(2)').text.strip()
    except AttributeError:  # select_one returned None: the date node is missing
        input_date = 'empty'
    news = re.sub(r'\s+', ' ', soup.select_one('div#harmonyContainer').text)
    return title, reporter, input_date, news

title, reporter, input_date, news = \
                get_daumnews_info('https://news.v.daum.net/v/20200722140504165')
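
Since the function returns None when the request fails, a caller can guard the unpacking; a minimal sketch using the same sample URL:

info = get_daumnews_info('https://news.v.daum.net/v/20200722140504165')
if info is not None:
    title, reporter, input_date, news = info
    print(title, reporter, input_date)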

Daum ranking news list

  • URL
  • Selectors for the items to retrieve
    - common : ul.list_news2 div.cont_thumb
    - title : strong > a : text
    - article link : strong > a : href attribute
    - newspaper : strong > span : text
    - summary : div.desc_thumb > span.link_txt : text
url = 'https://news.daum.net/ranking/popular'

import requests
from urllib import parse
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime


def get_daum_ranking_news(type='popular'):
    base_url = 'https://news.daum.net/ranking/'
    url = parse.urljoin(base_url, type)  # e.g. 'popular' -> https://news.daum.net/ranking/popular
    res = requests.get(url)
    if res.status_code == 200:
        soup = BeautifulSoup(res.text, 'html.parser')
        news_list = soup.select('ul.list_news2 div.cont_thumb')
        content_list = []  # holds the scraped results (50 articles): link, title, newspaper, summary
        for news in news_list:
            link = news.select_one('strong > a').get('href')
            title = news.select_one('strong > a').text.strip()
            newspaper = news.select_one('strong > span').text.strip()
            summary = news.select_one('div.desc_thumb > span.link_txt').text
            content_list.append([link, title, newspaper, summary])

        filename = "{}_{}_news_list.csv".format(type, datetime.now().strftime('%Y-%m-%d'))
        df = pd.DataFrame(content_list, columns=['link', 'title', 'newspaper', 'summary'])
        df.to_csv(filename, index=False, encoding='utf-8')
    else:
        raise Exception('request failed: {} {}'.format(url, res.status_code))
get_daum_ranking_news('kkomkkom')
get_daum_ranking_news('bestreply')
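
parse.urljoin is what splices the ranking type onto the base URL; a quick illustration of its behavior:

from urllib import parse

print(parse.urljoin('https://news.daum.net/ranking/', 'popular'))
# -> https://news.daum.net/ranking/popular

Note the trailing slash on the base: without it, urljoin would replace the last path segment ('ranking') instead of appending below it.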
import pandas as pd
df = pd.read_csv('popular_2020-07-24_news_list.csv', encoding='UTF-8')
df.head()