Naver Crawler:每个循环Python组合DataFrame [英] Naver Crawler: Combining DataFrame per each loop Python

查看:118
本文介绍了Naver Crawler:每个循环Python组合DataFrame的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我正在开发我的 Naver 爬虫(Naver 相当于韩国的 Google :P)。这段代码我已经写了一个星期,现在只剩最后一个问题要解决!下面的代码通过 Naver API 抓取数据,并在每次循环中把返回的数据接收到 "js" 中。我需要做的就是把每次循环得到的数据帧(dfdfdf)合并起来,在最后汇总成一个结果。但我的结果总是只显示最后一次循环的数据。归根结底,我想把每次循环得到的 DataFrame 都累加进去。我试过 merge 和 join,但似乎都不起作用。如果下面的代码有不清楚的地方(或者写得太乱),也请告诉我!

I am working on my Naver crawler (Naver is Korea's Google :P). I have been working on this code for a week now, and I have one last task to solve! My code below crawls data through the Naver API and receives the response into "js" in each loop iteration. All I need to do is combine the DataFrames (dfdfdf) from every iteration into a single result at the bottom. But my result always shows only the data from the last iteration. The bottom line is that I want to accumulate the DataFrame from each iteration of the loop. I tried merge and join, but neither seems to work. Please let me know if my code below does not make sense (or is too messy)!

import os
import sys
import urllib.request
import pandas as pd
import json
import numpy as np
from datetime import datetime, timedelta
import time

# Load the keyword hierarchy from worksheet 'Sheet1' of the workbook.
workbook = pd.ExcelFile('mat_hierarchy.xlsx')
ex = workbook.parse('Sheet1')

# Candidate keywords: every value of the column at position 3, followed by
# every value of the column at position 2.
DNA1 = [*ex.iloc[:, 3], *ex.iloc[:, 2]]

# De-duplicate while keeping first-seen order (dicts preserve insertion order).
DNA = list(dict.fromkeys(DNA1))
seen = set(DNA)

# Weekly (Monday-anchored) date index from 2016-01-01 up to two days ago.
# For a daily index, drop the freq argument.
today = datetime.now().date()
dd = pd.date_range(start='2016-01-01', end=today - timedelta(2), freq='W-MON')

# The request's end date is yesterday, formatted as YYYY-MM-DD.
setendDate = today - timedelta(1)
endDate = setendDate.strftime('%Y-%m-%d')

# Empty frame carrying only the date index; each response is joined onto it.
Data = pd.DataFrame(index=dd)

# Naver API connection settings.
client_id = "ID"
client_secret = "PW"
url = "https://openapi.naver.com/v1/datalab/search"

# Fragments of the JSON request body; the keyword names are spliced in
# between them when a request is assembled.
body_intro = '{"startDate":"2016-01-01","endDate":"'
body_endDate = '","timeUnit":"date","keywordGroups":[{"groupName":"'
body_keywords = '","keywords":["'
body_groupName = '"]},{"groupName":"'
body_last = '"]}],"ages":["1","2","3","4","5","6","7","8","9","10","11"]}'

# Collects one joined frame per request.
df_list = []

# Index of the first keyword to request. It must stay a multiple of
# CHUNK_SIZE so that only the very last request can be a partial chunk.
START_INDEX = 2270
# The original branches built bodies for five keyword groups down to one.
CHUNK_SIZE = 5


def _build_body(keywords):
    """Return the JSON request body for the given keyword names.

    Each keyword becomes one keyword group whose group name and single
    keyword are both the keyword itself; groups are separated by
    ``body_groupName``, which reproduces the hand-written bodies exactly.
    """
    groups = body_groupName.join(kw + body_keywords + kw for kw in keywords)
    return body_intro + endDate + body_endDate + groups + body_last


def _fetch_json(body):
    """POST ``body`` to the API and return the decoded response text.

    Returns ``None`` (after printing the status code) when the response
    status is not 200, so the caller can skip the chunk instead of
    re-parsing a stale response from an earlier iteration.
    """
    request = urllib.request.Request(url)
    request.add_header("X-Naver-Client-Id", client_id)
    request.add_header("X-Naver-Client-Secret", client_secret)
    request.add_header("Content-Type", "application/json")
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(request, data=body.encode("utf-8")) as response:
        rescode = response.getcode()
        if rescode != 200:
            # rescode is an int; it must be converted before concatenation.
            print("Error Code:" + str(rescode))
            return None
        return response.read().decode('utf-8')


def _results_to_frame(d):
    """Build one frame with a column per entry of ``d['results']``.

    An entry with data is indexed by its 'period' values and its 'ratio'
    column is renamed to the entry's title. An entry without data becomes
    a single NaN row at the response's start date so its column survives.
    """
    lst = []
    for r in d['results']:
        if len(r['data']) > 0:
            frame = pd.DataFrame(r['data']).set_index('period')
            frame = frame.rename(columns={'ratio': r['title']})
        else:
            frame = pd.DataFrame([np.nan], columns=[r['title']],
                                 index=[d['startDate']])
        lst.append(frame)
    # axis must be passed by keyword; pandas 2.x rejects it positionally.
    return pd.concat(lst, axis=1)


for i in range(START_INDEX, len(DNA), CHUNK_SIZE):
    # Slicing yields a full chunk, or the shorter remainder on the last pass.
    chunk = DNA[i:i + CHUNK_SIZE]
    body = _build_body(chunk)
    print(str(len(chunk)))

    js = _fetch_json(body)
    if js is not None:
        d = json.loads(js)
        df = _results_to_frame(d)
        # Align the response with the shared date index.
        dfdfdf = Data.join(df)
        df_list.append(dfdfdf)

    print("end")
    # Pause between requests.
    time.sleep(.5)

# Combine the per-request frames side by side (one column per keyword).
if df_list:
    Final = pd.concat(df_list, axis=1)
    Final.to_csv("Naver123.csv")
else:
    # pd.concat raises on an empty list; report instead of crashing.
    print("No data collected; Naver123.csv was not written")

推荐答案

考虑把各个数据帧收集到一个列表中,并在 for 循环之外进行拼接。每次循环内部执行水平合并,而最后的总合并执行垂直追加。

Consider collecting the dataframes in a list and concatenating them outside the for loop. Each loop iteration performs a horizontal merge, whereas the final master combine performs a vertical append.

另外,为了让代码更符合 DRY 原则,可以考虑定义一个函数,把响应转换为数据帧,并将 body 变量作为参数传入——它是各个 if 分支之间唯一的区别。

Also, for a DRY-er solution, consider defining a function that converts the response into a dataframe and takes the body variable as a parameter, since body is the only difference between the if blocks.

...
def response_to_df(body):
    """POST one request body to the API and return its results as a frame.

    Parameters
    ----------
    body : str
        JSON request payload; it is UTF-8 encoded before sending.

    Returns
    -------
    pandas.DataFrame or None
        ``Data`` joined with one column per entry of the response's
        'results' list. ``None`` is returned, after printing the status
        code, when the response status is not 200.
    """
    request = urllib.request.Request(url)
    request.add_header("X-Naver-Client-Id", client_id)
    request.add_header("X-Naver-Client-Secret", client_secret)
    request.add_header("Content-Type", "application/json")
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(request, data=body.encode("utf-8")) as response:
        rescode = response.getcode()
        if rescode != 200:
            # rescode is an int; it must be converted before concatenation.
            print("Error Code:" + str(rescode))
            # There is no payload to parse, so signal the failure explicitly.
            return None
        js = response.read().decode('utf-8')

    d = json.loads(js)
    lst = []
    for r in d['results']:
        if len(r['data']) > 0:
            frame = pd.DataFrame(r['data']).set_index('period')
            frame = frame.rename(columns={'ratio': r['title']})
        else:
            # An entry without data has no 'period' column to index on;
            # keep its column as a single NaN row at the start date.
            frame = pd.DataFrame([np.nan], columns=[r['title']],
                                 index=[d['startDate']])
        lst.append(frame)

    # HORIZONTAL MERGE
    df = pd.concat(lst, axis=1)
    df = Data.join(df)
    return df


df_list = []
# Step through DNA five keywords at a time. The start must be given
# explicitly: range(len(DNA), 5) would begin at len(DNA) and never iterate.
for i in range(0, len(DNA), 5):
    # Slicing yields a full chunk of five, or the shorter remainder on the
    # last pass, so one body builder covers every chunk size.
    chunk = DNA[i:i + 5]
    body = body_intro + endDate + body_endDate + \
           body_groupName.join(kw + body_keywords + kw for kw in chunk) + \
           body_last
    print(str(len(chunk)))

    tmp = response_to_df(body)
    # Defensive: skip the chunk if the helper returned nothing.
    if tmp is not None:
        df_list.append(tmp)


# Combining all Data (VERTICAL APPEND)
# NOTE(review): every frame appears to share the Data index and carry
# different keyword columns, so axis=1 may be what is wanted here -- confirm.
Naver = pd.concat(df_list, axis=0)
print("ddd")
Naver

这篇关于Naver Crawler:每个循环Python组合DataFrame的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆