MongoDB协助提出建议 [英] MongoDB to assist with recommendations

查看:46
本文介绍了MongoDB协助提出建议的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我有一个3收集模式,如下所示:

  1. 用户收藏中包含有关其朋友的信息以及每位艺术家的收听次数(权重)

    {
        user_id : 1,
        Friends : [3,5,6],
        Artists : [
            {artist_id: 10 , weight : 345},
            {artist_id: 17 , weight : 378}
        ]
    }
    

  2. 艺术家收藏模式具有有关艺术家姓名,各种用户为其分配的标签的信息.

    {
        artistID : 56,
        name : "Ed Sheeran",
        user_tag : [
           {user_id : 2, tag_id : 6},
           {user_id : 2, tag_id : 5},
           {user_id : 3, tag_id : 7}
        ]
    }
    

  3. 标签集合,其中包含有关各种标签的信息.

    {tag_id : 3, tag_value : "HipHop"}
    

我想通过使用以下规则为用户提供关于艺术家的推荐:

规则1:查找用户的朋友(而不是用户)听过的艺术家,然后按朋友的听音总数来排序.

规则2:选择用户使用的任何标签,找到不在该用户的收听列表中的所有带有该标签的艺术家,然后按唯一的收听者的数量对其进行排序.

有人可以帮我写一个查询来执行以上操作吗?

解决方案

您需要在此处做一些事情以达到最终结果,但是第一步比较简单.获取您提供的用户对象:

 // Sample user document: the user's id, the ids of their friends, and the
// per-artist listening counts ("weight") used throughout the examples below.
var user = {
    "user_id": 1,
    "Friends": [ 3, 5, 6 ],
    "Artists": [
        { "artist_id": 10, "weight": 345 },
        { "artist_id": 17, "weight": 378 }
    ]
};
 

现在假设您已经获取了该数据,然后归结为为每个朋友"找到相同的结构,并将艺术家"的数组内容过滤到一个单独的列表中.大概这里每个重量"也将被考虑在内.

这是一个简单的聚合操作,它将首先过滤出列表中已存在的给定用户的艺术家:

 // Ids of the artists the user already listens to; these must not be recommended.
var artists = user.Artists.map(function(artist) { return artist.artist_id });

User.aggregate(
    [
        // Find friends that have at least one artist the user does not have yet.
        // $elemMatch is needed: a plain "Artists.artist_id": { "$nin": ... } would
        // discard every friend that shares even a single artist with the user.
        { "$match": {
            "user_id": { "$in": user.Friends },
            "Artists": {
                "$elemMatch": { "artist_id": { "$nin": artists } }
            }
        }},
        // Pre-filter the artists already in the user list: each known artist is
        // mapped to false, and $setDifference then strips the false entries.
        { "$project": {
            "Artists": {
                "$setDifference": [
                    { "$map": {
                        "input": "$Artists",
                        "as": "el",
                        "in": {
                            "$cond": [
                                { "$anyElementTrue": [
                                    { "$map": {
                                        "input": artists,
                                        "as": "artist",
                                        "in": { "$eq": [ "$$artist", "$$el.artist_id" ] }
                                    }}
                                ]},
                                false,
                                "$$el"
                            ]
                        }
                    }},
                    [false]
                ]
            }
        }},
        // Unwind the reduced array
        { "$unwind": "$Artists" },
        // Group back by each artist and sum weights
        { "$group": {
            "_id": "$Artists.artist_id",
            "weight": { "$sum": "$Artists.weight" }
        }},
        // Sort the results by weight
        { "$sort": { "weight": -1 } }
    ],
    function(err,results) {
        // more to come here
    }
);
 

预过滤器"是这里唯一真正棘手的部分.您可以 $unwind 数组和 $match 过滤掉不需要的条目.即使我们想稍后对结果进行$unwind以便将它们组合起来,也可以更有效地从数组"first"中删除它们,因此扩展的余地更少.

因此,这里 $map 运算符允许检查每个用户艺术家"数组的元素,还可以与过滤后的用户"艺术家列表进行比较,以仅返回所需的详细信息. $setDifference 实际用于过滤"任何结果不作为数组内容返回,而是作为false返回.

此后,只有$unwind可以对数组中的内容进行非规范化,并且 $group ,将每个艺术家的总数汇总在一起.出于娱乐目的,我们使用$sort来显示按期望的顺序返回列表,但是在以后的步骤中不需要这样做.

这至少完成了其中的一部分,因为结果列表应该仅包含不在用户自己列表中的其他艺术家,并按可能出现在多个朋友列表中的艺术家的总"权重"进行排序.

下一部分将需要来自艺术家"收藏的数据,以便将听众的数量考虑在内.猫鼬虽然具有.populate()方法,但是在寻找不同用户"计数时,您确实不希望在这里使用此方法.这意味着要实现另一种聚合方式,以便为每个艺术家获得不同的计数.

根据上一个聚合操作的结果列表,您将使用如下所示的$_id值:

 // First get just an array of artist id's
var artists = results.map(function(artist) {
    return artist._id;
});

Artist.aggregate(
    [
        // Match artists
        { "$match": {
            "artistID": { "$in": artists }
        }},
        // Project with weight for distinct users:
        // (number of distinct user_id values in user_tag) * 10
        { "$project": {
            "_id": "$artistID",
            "weight": {
                "$multiply": [
                    { "$size": {
                        // $setUnion with an empty set de-duplicates the user ids
                        "$setUnion": [
                            { "$map": {
                                // $ifNull guards artists without a user_tag array;
                                // otherwise $size would fail on a null value
                                "input": { "$ifNull": [ "$user_tag", [] ] },
                                "as": "tag",
                                "in": "$$tag.user_id"
                            }},
                            []
                        ]
                    }},
                    10
                ]
            }
        }}
    ],
    function(err,results) {
        // more later
    }
);
 

这里,技巧是在聚合中使用$map对值进行类似的转换,然后将其馈送到 $setUnion 以使其成为唯一列表.然后应用 $size 运算符来找出该列表有多大.额外的数学运算是为了在将该数字与先前结果中已记录的权重结合使用时赋予它一定的意义.

当然,您需要以某种方式将所有这些组合在一起,因为目前只有两组截然不同的结果.基本过程是一个哈希表",其中将唯一的艺术家" id值用作键,并组合权重"值.

您可以通过多种方式执行此操作,但是由于希望对组合的结果进行排序",因此我更喜欢"MongoDBish",因为它遵循了您应该已经习惯的基本方法.

一种方便的实现方法是使用 nedb ,它提供了内存中"存储使用与读取和写入MongoDB集合相同的方法.

如果您需要使用实际的集合以获得较大的结果,这也可以很好地扩展,因为所有原理都相同.

  1. 第一次聚合操作会将新数据插入商店

  2. 第二次聚合更新"数据,其增量为权重"字段

作为完整的功能列表,并在 async 库的其他帮助下,它看起来像像这样:

 /**
 * Build artist recommendations for one user.
 *
 * Combines two aggregations in an in-memory nedb store:
 *   1. the summed listening weights of the user's friends for artists the
 *      user does not have yet (User collection);
 *   2. for those artists, 10 * the number of distinct users that tagged
 *      them (Artist collection), added on top of the first weight.
 *
 * @param {number} userId - value of the "user_id" field of the user.
 * @param {function(Error, Array=)} callback - called once, with an error or
 *     with the stored { artist_id, weight } documents sorted by descending weight.
 */
function GetUserRecommendations(userId,callback) {

    var async = require('async'),
        DataStore = require('nedb');

    // Working space for combining both result sets; one store per call so
    // concurrent calls never see each other's data.
    var store = new DataStore();

    User.findOne({ "user_id": userId },function(err,user) {
        if (err) return callback(err);
        if (!user) return callback(new Error("No user found for user_id " + userId));

        var friends = user.Friends || [];
        var artists = (user.Artists || []).map(function(artist) {
            return artist.artist_id;
        });

        async.waterfall(
            [
                // Step 1: friend listening weights for artists new to the user
                function(next) {
                    var pipeline =  [
                        // Find friends with at least one artist the user lacks.
                        // $elemMatch is needed: a plain $nin on "Artists.artist_id"
                        // would drop any friend sharing a single artist with the user.
                        { "$match": {
                            "user_id": { "$in": friends },
                            "Artists": {
                                "$elemMatch": { "artist_id": { "$nin": artists } }
                            }
                        }},
                        // Pre-filter the artists already in the user list
                        { "$project": {
                            "Artists": {
                                "$setDifference": [
                                    { "$map": {
                                        "input": "$Artists",
                                        "as": "el",
                                        "in": {
                                            "$cond": [
                                                { "$anyElementTrue": [
                                                    { "$map": {
                                                        "input": artists,
                                                        "as": "artist",
                                                        "in": { "$eq": [ "$$artist", "$$el.artist_id" ] }
                                                    }}
                                                ]},
                                                false,
                                                "$$el"
                                            ]
                                        }
                                    }},
                                    [false]
                                ]
                            }
                        }},
                        // Unwind the reduced array
                        { "$unwind": "$Artists" },
                        // Group back by each artist and sum weights
                        { "$group": {
                            "_id": "$Artists.artist_id",
                            "weight": { "$sum": "$Artists.weight" }
                        }},
                        // Sort the results by weight
                        { "$sort": { "weight": -1 } }
                    ];

                    User.aggregate(pipeline, function(err,results) {
                        if (err) return next(err);

                        async.each(
                            results,
                            function(result,done) {
                                // Store the id as artist_id; _id is left for nedb to generate
                                store.insert(
                                    { "artist_id": result._id, "weight": result.weight },
                                    function(err) { done(err); }
                                );
                            },
                            function(err) {
                                if (err) return next(err);
                                next(null,results.map(function(result) {
                                    return result._id;
                                }));
                            }
                        );

                    });
                },
                // Step 2: add the distinct-tagging-user score for those artists
                function(artistIds,next) {

                    var pipeline = [
                        // Match artists
                        { "$match": {
                            "artistID": { "$in": artistIds }
                        }},
                        // Project with weight for distinct users
                        { "$project": {
                            "_id": "$artistID",
                            "weight": {
                                "$multiply": [
                                    { "$size": {
                                        "$setUnion": [
                                            { "$map": {
                                                // guard artists without a user_tag array
                                                "input": { "$ifNull": [ "$user_tag", [] ] },
                                                "as": "tag",
                                                "in": "$$tag.user_id"
                                            }},
                                            []
                                        ]
                                    }},
                                    10
                                ]
                            }
                        }}
                    ];

                    Artist.aggregate(pipeline,function(err,results) {
                        if (err) return next(err);
                        async.each(
                            results,
                            function(result,done) {
                                store.update(
                                    { "artist_id": result._id },
                                    { "$inc": { "weight": result.weight } },
                                    {},
                                    function(err) { done(err); }
                                );
                            },
                            function(err) {
                                next(err);
                            }
                        );
                    });
                }
            ],
            function(err) {
                if (err) return callback(err);     // callback with any errors
                // else fetch the combined results and sort to callback
                store.find({}).sort({ "weight": -1 }).exec(callback);
            }
        );

    });

}
 

因此,在匹配初始源用户对象之后,将值传递到第一个聚合函数,该函数按顺序执行并使用async.waterfall传递其结果.

不过在此之前,先使用常规的.insert()语句将聚合结果添加到DataStore中,并注意重命名_id字段,因为nedb只接受它自己生成的_id值.每个结果都以聚合结果中的artist_id和weight属性插入.

然后将该列表传递到第二个聚合操作,该操作将根据不同的用户大小,为每个指定的艺术家"返回计算出的重量".对于每个艺术家,在DataStore上都有具有相同.update()语句的已更新",并增加了重量"字段.

一切顺利,最后的操作是.find()这些结果,并通过组合的"weight" .sort()将它们返回,然后将结果简单地返回到传递给函数的回调中.

因此,您将像这样使用它:

 GetUserRecommendations(1,function(err,results) {
    if (err) {
        // the lookup or one of the aggregations failed; do not use results
        console.error(err);
        return;
    }
    // results is the sorted list
});
 

它将返回当前不在该用户列表中但在其朋友列表中的所有艺术家,并按朋友收听次数的权重加上该艺术家不同用户的得分的总和进行排序.

这是您处理来自两个不同集合的数据的方法,您需要将这些集合合并为具有各种汇总详细信息的单个结果.这是多个查询和一个工作空间,也是MongoDB哲学的一部分,这种操作比将它们扔到数据库以加入"结果更好.

I have a 3 collection schema as shown below:

  1. User collection has information regarding their friends and the listening count(weight) per artist

    {
        user_id : 1,
        Friends : [3,5,6],
        Artists : [
            {artist_id: 10 , weight : 345},
            {artist_id: 17 , weight : 378}
        ]
    }
    

  2. Artist collection schema has information regarding the name of the artist, the tags given by various users to them.

    {
        artistID : 56,
        name : "Ed Sheeran",
        user_tag : [
           {user_id : 2, tag_id : 6},
           {user_id : 2, tag_id : 5},
           {user_id : 3, tag_id : 7}
        ]
    }
    

  3. Tags collection having information about the various tags.

    {tag_id : 3, tag_value : "HipHop"}
    

I want to provide a user with recommendations for artists by using the below rules:

Rule 1 : Find artists listened by the user's friends but not the user, order them by the sum of friends' listening counts.

Rule 2 : select any tag used by the user, find all artist with this tag who are not in the user's listening list, and order them by the number of unique listener.

Can anybody help me write a query to perform the above.

解决方案

You need to do a couple of things here for your end result, but the first stages are relatively simple. Take the user object you provide:

// Sample user document: the user's id, the ids of their friends, and the
// per-artist listening counts ("weight") used throughout the examples below.
var user = {
    "user_id": 1,
    "Friends": [ 3, 5, 6 ],
    "Artists": [
        { "artist_id": 10, "weight": 345 },
        { "artist_id": 17, "weight": 378 }
    ]
};

Now presuming you already have that data retrieved, then this comes down to finding the same structures for each "friend" and filtering out the array content of "Artists" into a single distinct list. Presumably each "weight" will also be considered in total here.

This is a simple aggregation operation that will first filter out the artists already in the list for the given user:

// Ids of the artists the user already listens to; these must not be recommended.
var artists = user.Artists.map(function(artist) { return artist.artist_id });

User.aggregate(
    [
        // Find friends that have at least one artist the user does not have yet.
        // $elemMatch is needed: a plain "Artists.artist_id": { "$nin": ... } would
        // discard every friend that shares even a single artist with the user.
        { "$match": {
            "user_id": { "$in": user.Friends },
            "Artists": {
                "$elemMatch": { "artist_id": { "$nin": artists } }
            }
        }},
        // Pre-filter the artists already in the user list: each known artist is
        // mapped to false, and $setDifference then strips the false entries.
        { "$project": {
            "Artists": {
                "$setDifference": [
                    { "$map": {
                        "input": "$Artists",
                        "as": "el",
                        "in": {
                            "$cond": [
                                { "$anyElementTrue": [
                                    { "$map": {
                                        "input": artists,
                                        "as": "artist",
                                        "in": { "$eq": [ "$$artist", "$$el.artist_id" ] }
                                    }}
                                ]},
                                false,
                                "$$el"
                            ]
                        }
                    }},
                    [false]
                ]
            }
        }},
        // Unwind the reduced array
        { "$unwind": "$Artists" },
        // Group back by each artist and sum weights
        { "$group": {
            "_id": "$Artists.artist_id",
            "weight": { "$sum": "$Artists.weight" }
        }},
        // Sort the results by weight
        { "$sort": { "weight": -1 } }
    ],
    function(err,results) {
        // more to come here
    }
);

The "pre-filter" is the only really tricky part here. You could just $unwind the array and $match again to filter out the entries you don't want. Even though we want to $unwind the results later in order to combine them, it works out more efficient to remove them from the array "first", so there is less to expand.

So here the $map operator allows inspection of each element of the user "Artists" array and also for comparison against the filtered "user" artists list to just return the wanted details. The $setDifference is used to actually "filter" any results that were not returned as the array content, but rather returned as false.

After that there is just the $unwind to de-normalize the content in the array and the $group to bring together a total per artist. For fun we are using $sort to show that the list is returned in desired order, but that will not be necessary at a later stage.

That is at least part of the way along here as the resulting list should only be other artists not already in the user's own list, and sorted by the summed "weight" from any artists that could possibly appear on multiple friends.

The next part is going to need data from the "artists" collection in order to take the number of listeners into account. Whilst mongoose has a .populate() method, you really don't want this here as you are looking for the "distinct user" counts. This implies another aggregation implementation in order to get those distinct counts for each artist.

Following on from the result list of the previous aggregation operation, you would use the $_id values like this:

// First get just an array of artist id's
var artists = results.map(function(artist) {
    return artist._id;
});

Artist.aggregate(
    [
        // Match artists
        { "$match": {
            "artistID": { "$in": artists }
        }},
        // Project with weight for distinct users:
        // (number of distinct user_id values in user_tag) * 10
        { "$project": {
            "_id": "$artistID",
            "weight": {
                "$multiply": [
                    { "$size": {
                        // $setUnion with an empty set de-duplicates the user ids
                        "$setUnion": [
                            { "$map": {
                                // $ifNull guards artists without a user_tag array;
                                // otherwise $size would fail on a null value
                                "input": { "$ifNull": [ "$user_tag", [] ] },
                                "as": "tag",
                                "in": "$$tag.user_id"
                            }},
                            []
                        ]
                    }},
                    10
                ]
            }
        }}
    ],
    function(err,results) {
        // more later
    }
);

Here the trick is done in aggregate with $map to do a similar transform of values which is fed to $setUnion to make them a unique list. Then the $size operator is applied to find out how big that list is. The additional math is to give that number some meaning when applied against the already recorded weights from the previous results.

Of course you need to bring all of this together somehow, as right now there are just two distinct sets of results. The basic process is a "Hash Table", where the unique "artist" id values are used as a key and the "weight" values are combined.

You can do this in a number of ways, but since there is a desire to "sort" the combined results, my preference would be something "MongoDBish", since it follows the basic methods you should already be used to.

A handy way to implement this is using nedb, which provides an "in memory" store that uses much of the same type of methods as used to read and write to MongoDB collections.

This also scales well if you needed to use an actual collection for large results, as all the principles remain the same.

  1. First aggregation operation inserts new data to the store

  2. Second aggregation "updates" that data and increments the "weight" field

As a complete function listing, and with some other help of the async library it would look like this:

/**
 * Build artist recommendations for one user.
 *
 * Combines two aggregations in an in-memory nedb store:
 *   1. the summed listening weights of the user's friends for artists the
 *      user does not have yet (User collection);
 *   2. for those artists, 10 * the number of distinct users that tagged
 *      them (Artist collection), added on top of the first weight.
 *
 * @param {number} userId - value of the "user_id" field of the user.
 * @param {function(Error, Array=)} callback - called once, with an error or
 *     with the stored { artist_id, weight } documents sorted by descending weight.
 */
function GetUserRecommendations(userId,callback) {

    var async = require('async'),
        DataStore = require('nedb');

    // Working space for combining both result sets; one store per call so
    // concurrent calls never see each other's data.
    var store = new DataStore();

    User.findOne({ "user_id": userId },function(err,user) {
        if (err) return callback(err);
        if (!user) return callback(new Error("No user found for user_id " + userId));

        var friends = user.Friends || [];
        var artists = (user.Artists || []).map(function(artist) {
            return artist.artist_id;
        });

        async.waterfall(
            [
                // Step 1: friend listening weights for artists new to the user
                function(next) {
                    var pipeline =  [
                        // Find friends with at least one artist the user lacks.
                        // $elemMatch is needed: a plain $nin on "Artists.artist_id"
                        // would drop any friend sharing a single artist with the user.
                        { "$match": {
                            "user_id": { "$in": friends },
                            "Artists": {
                                "$elemMatch": { "artist_id": { "$nin": artists } }
                            }
                        }},
                        // Pre-filter the artists already in the user list
                        { "$project": {
                            "Artists": {
                                "$setDifference": [
                                    { "$map": {
                                        "input": "$Artists",
                                        "as": "el",
                                        "in": {
                                            "$cond": [
                                                { "$anyElementTrue": [
                                                    { "$map": {
                                                        "input": artists,
                                                        "as": "artist",
                                                        "in": { "$eq": [ "$$artist", "$$el.artist_id" ] }
                                                    }}
                                                ]},
                                                false,
                                                "$$el"
                                            ]
                                        }
                                    }},
                                    [false]
                                ]
                            }
                        }},
                        // Unwind the reduced array
                        { "$unwind": "$Artists" },
                        // Group back by each artist and sum weights
                        { "$group": {
                            "_id": "$Artists.artist_id",
                            "weight": { "$sum": "$Artists.weight" }
                        }},
                        // Sort the results by weight
                        { "$sort": { "weight": -1 } }
                    ];

                    User.aggregate(pipeline, function(err,results) {
                        if (err) return next(err);

                        async.each(
                            results,
                            function(result,done) {
                                // Store the id as artist_id; _id is left for nedb to generate
                                store.insert(
                                    { "artist_id": result._id, "weight": result.weight },
                                    function(err) { done(err); }
                                );
                            },
                            function(err) {
                                if (err) return next(err);
                                next(null,results.map(function(result) {
                                    return result._id;
                                }));
                            }
                        );

                    });
                },
                // Step 2: add the distinct-tagging-user score for those artists
                function(artistIds,next) {

                    var pipeline = [
                        // Match artists
                        { "$match": {
                            "artistID": { "$in": artistIds }
                        }},
                        // Project with weight for distinct users
                        { "$project": {
                            "_id": "$artistID",
                            "weight": {
                                "$multiply": [
                                    { "$size": {
                                        "$setUnion": [
                                            { "$map": {
                                                // guard artists without a user_tag array
                                                "input": { "$ifNull": [ "$user_tag", [] ] },
                                                "as": "tag",
                                                "in": "$$tag.user_id"
                                            }},
                                            []
                                        ]
                                    }},
                                    10
                                ]
                            }
                        }}
                    ];

                    Artist.aggregate(pipeline,function(err,results) {
                        if (err) return next(err);
                        async.each(
                            results,
                            function(result,done) {
                                store.update(
                                    { "artist_id": result._id },
                                    { "$inc": { "weight": result.weight } },
                                    {},
                                    function(err) { done(err); }
                                );
                            },
                            function(err) {
                                next(err);
                            }
                        );
                    });
                }
            ],
            function(err) {
                if (err) return callback(err);     // callback with any errors
                // else fetch the combined results and sort to callback
                store.find({}).sort({ "weight": -1 }).exec(callback);
            }
        );

    });

}

So after matching the initial source user object, the values are passed into the first aggregate function, which executes in series, using async.waterfall to pass on its result.

Before that happens, though, the aggregation results are added to the DataStore with regular .insert() statements, taking care to rename the _id fields, as nedb does not like anything other than its own self-generated _id values. Each result is inserted with artist_id and weight properties from the aggregation result.

That list is then passed to the second aggregation operation, which is going to return each specified "artist" with a calculated "weight" based on the distinct user size. These are then "updated" with the same .update() statement on the DataStore for each artist, incrementing the "weight" field.

All going well, the final operation is to .find() those results and .sort() them by the combined "weight", and simply return the result to the passed in callback to the function.

So you would use it like this:

GetUserRecommendations(1,function(err,results) {
    if (err) {
        // the lookup or one of the aggregations failed; do not use results
        console.error(err);
        return;
    }
    // results is the sorted list
});

And it is going to return all of the artists not presently in that user's list but in their friends lists and ordered by the combined weights of the friend listening count plus the score from the number of distinct users of that artist.

This is how you deal with data from two different collections that you need to combine into a single result with various aggregated details. It's multiple queries and a working space, but it is also part of the MongoDB philosophy that such operations are better performed this way than by throwing them at the database to "join" results.

这篇关于MongoDB协助提出建议的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆