如何将 200 多条下载的推文附加到数据帧? [英] How to append more than 200 downloaded tweets to dataframe?

查看:31
本文介绍了如何将 200 多条下载的推文附加到数据帧?的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我正在使用循环下载超过 twitters 速率上限的内容;但是,当我尝试附加列表时,它返回一个空数据框.

I am downloading more than twitters rate cap using a loop; however, when I try to append the list it returns an empty dataframe.

我的函数看起来像:

输入:

import pandas as pd
import numpy as np
import tweepy
from datetime import timedelta

def get_tweets(handle):
    batch_count_for_tweet_downloads = 200
    try:
        alltweets = []
        tweets = api_twitter.user_timeline(screen_name=handle,
                                           count=batch_count_for_tweet_downloads,
                                           exclude_replies=True,
                                           include_rts=False,
                                           lang="en",
                                           tweet_mode="extended")
        # ---GET MORE THAN 200 TWEETS
        alltweets.extend(tweets)
        oldest = alltweets[-1].id - 1
        oldest_datetime = pd.to_datetime(str(pd.to_datetime(oldest))[:-10]).strftime("%Y-%m-%d %H:%M:%S")
        print(f"Getting Tweets For " + handle + ", After: " + oldest_datetime)
        while len(tweets) > 0:
            tweets = api_twitter.user_timeline(screen_name=handle, count=batch_count_for_tweet_downloads, max_id=oldest)
            alltweets.extend(tweets)
            oldest = alltweets[-1].id - 1
            print("Count: " + f"...{len(alltweets)} " + handle + " Tweets Downloaded")
        #---
        df = pd.DataFrame(data=[tweets.user.screen_name for tweets in alltweets], columns=['Handle'])
        df['Tweets'] = np.array([tweets.full_text for tweets in alltweets])
        df['Date'] = np.array([tweets.created_at - timedelta(hours=4) for tweets in alltweets])
        df['Len'] = np.array([len(tweets.full_text) for tweets in alltweets])
        df['Like_count'] = np.array([tweets.favorite_count for tweets in alltweets])
        df['RT_count'] = np.array([tweets.retweet_count for tweets in alltweets])
        total_tweets.extend(alltweets)
        print(handle + " Total Tweets Extracted: {}".format(len(alltweets)))
    except:
        pass
    return df

如您所见,我需要帮助将循环合并到函数中.

As you can see I need some help merging the loop into the function.

这样做的最佳方法是什么?

What is the best way of doing this?

提前感谢您的帮助.

编辑 1:(我的代码现在的样子)

输入:

import tweepy
import pandas as pd
import numpy as np
from datetime import timedelta

handles = ['@MrML16419203', '@d00tn00t']

consumerKey = 'x'
consumerSecret = 'x'
accessToken = 'x'
accessTokenSecret = 'x'

authenticate = tweepy.OAuthHandler(consumerKey, consumerSecret)
authenticate.set_access_token(accessToken, accessTokenSecret)
api_twitter = tweepy.API(authenticate, wait_on_rate_limit=True)

total_tweets = []
def get_tweets(handle):
    batch_count_for_tweet_downloads = 200
    try:
        alltweets = []
        tweets = api_twitter.user_timeline(screen_name=handle,
                                           count=batch_count_for_tweet_downloads,
                                           exclude_replies=True,
                                           include_rts=False,
                                           lang="en",
                                           tweet_mode="extended")
        alltweets.extend(tweets)
        oldest = alltweets[-1].id - 1
        oldest_datetime = pd.to_datetime(str(pd.to_datetime(oldest))[:-10]).strftime("%Y-%m-%d %H:%M:%S")
        print(f"Getting Tweets For " + handle + ", After: " + oldest_datetime)
        while len(tweets) > 0:
            tweets = api_twitter.user_timeline(screen_name=handle, count=batch_count_for_tweet_downloads, max_id=oldest)
            alltweets.extend(tweets)
            if len(alltweets) > 0:
                oldest = alltweets[-1].id - 1
            else:
                pass
            print("Count: " + f"...{len(alltweets)} " + handle + " Tweets Downloaded")
        print('---Total Downloaded: ' + str(len(alltweets)) + ' for ' + handle + '---')
        df = pd.DataFrame(data=[tweets.user.screen_name for tweets in alltweets], columns=['Handle'])
        df['Tweets'] = np.array([tweets.full_text for tweets in alltweets])
        df['Date'] = np.array([tweets.created_at - timedelta(hours=4) for tweets in alltweets])
        df['Len'] = np.array([len(tweets.full_text) for tweets in alltweets])
        df['Like_count'] = np.array([tweets.favorite_count for tweets in alltweets])
        df['RT_count'] = np.array([tweets.retweet_count for tweets in alltweets])

        print([tweets.favorite_count for tweets in alltweets])
        print(np.array([tweets.favorite_count for tweets in alltweets]))

        total_tweets.extend(alltweets)
        print("----------Total Tweets Extracted: {}".format(df.shape[0]) + "----------")
    except:
        pass
    return df
df = pd.DataFrame()
for handle in handles:
    df_new = get_tweets(handle)
    df = pd.concat((df, df_new))
print(df)

输出:

Getting Tweets For @MrML16419203, After: 2011-03-19 07:03:53
Count: ...136 @MrML16419203 Tweets Downloaded
---Total Downloaded: 136 for @MrML16419203---
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
----------Total Tweets Extracted: 136----------
Getting Tweets For @d00tn00t, After: 2009-11-27 19:18:58
Count: ...338 @d00tn00t Tweets Downloaded
Count: ...530 @d00tn00t Tweets Downloaded
Count: ...546 @d00tn00t Tweets Downloaded
Count: ...546 @d00tn00t Tweets Downloaded
---Total Downloaded: 546 for @d00tn00t---
           Handle   Tweets                Date  Len  Like_count  RT_count
0    MrML16419203   132716 2020-09-02 02:18:28  6.0         0.0       0.0
1    MrML16419203   432881 2020-09-02 02:04:23  6.0         0.0       0.0
2    MrML16419203   973625 2020-09-02 02:04:09  6.0         0.0       0.0
3    MrML16419203  1234567 2020-09-02 01:55:10  7.0         0.0       0.0
4    MrML16419203   225865 2020-09-02 01:27:11  6.0         0.0       0.0
..            ...      ...                 ...  ...         ...       ...
541      d00tn00t      NaN                 NaT  NaN         NaN       NaN
542      d00tn00t      NaN                 NaT  NaN         NaN       NaN
543      d00tn00t      NaN                 NaT  NaN         NaN       NaN
544      d00tn00t      NaN                 NaT  NaN         NaN       NaN
545      d00tn00t      NaN                 NaT  NaN         NaN       NaN

[682 rows x 6 columns]

正如您所见,对于少于 200 条推文的句柄,数据框被填充.但是,不适用于包含超过 200 条推文的句柄.

As you can see for handles which have less than 200 tweets the dataframe gets populated. However, not for handles which contain more than 200 tweets.

推荐答案

对于任何偶然发现此问题的人,我都可以使用它:

For anyone that stumbles across this I got it to work:

def get_tweets(screen_name):
batch_count_for_tweet_downloads = 200
try:
    alltweets = []
    tweets = api_twitter.user_timeline(screen_name=screen_name,
                                       count=batch_count_for_tweet_downloads,
                                       exclude_replies=True,
                                       include_rts=False,
                                       lang="en")
    alltweets.extend(tweets)
    oldest = alltweets[-1].id - 1
    oldest_datetime = pd.to_datetime(str(pd.to_datetime(oldest))[:-10]).strftime("%Y-%m-%d %H:%M:%S")
    print(f"Getting Tweets For " + handle + ", After: " + oldest_datetime)
    while len(tweets) > 0:
        tweets = api_twitter.user_timeline(screen_name=screen_name, count=batch_count_for_tweet_downloads,
                                           max_id=oldest)
        alltweets.extend(tweets)
        if len(alltweets) > 0:
            oldest = alltweets[-1].id - 1
        else:
            pass
        print("Count: " + f"...{len(alltweets)} " + handle + " Tweets Downloaded")
    outtweets = [
        [tweet.user.screen_name, tweet.text, tweet.created_at, len(tweet.text),
         tweet.favorite_count, tweet.retweet_count] for tweet in alltweets]
    df_tweet_function = pd.DataFrame(outtweets,
                                     columns=['Handle', 'Tweets', 'Date', 'Len', 'Like_count', 'RT_count'])
    print('----------Total Downloaded: ' + str(len(alltweets)) + ' for ' + handle + '----------')
except tweepy.error.TweepError:
    pass
return df_tweet_function

df = pd.DataFrame()如果 name == 'ma​​in':用于手柄中的手柄:get_tweets(句柄)df = df.append(get_tweets(handle))print("--------------提取的推文总数:{}".format(df.shape[0]) + "-------------------")

df = pd.DataFrame() if name == 'main': for handle in handles: get_tweets(handle) df = df.append(get_tweets(handle)) print("---------------TOTAL TWEETS EXTRACTED: {}".format(df.shape[0]) + "---------------")

这篇关于如何将 200 多条下载的推文附加到数据帧?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆