MongoDB aggregation $group, restrict length of array


Question

I want to group all the documents according to a field but restrict the number of documents grouped for each value.

Each message has a conversation_ID. I need to get 10 or fewer messages for each conversation_ID.

I am able to group according to the following command but can't figure out how to restrict the number of grouped documents apart from slicing the results:

Message.aggregate({'$group':{_id:'$conversation_ID',msgs:{'$push':{msgid:'$_id'}}}})

How can I limit the length of the msgs array for each conversation_ID to 10?

Solution

As stated earlier, this is not impossible but certainly a horrible problem.

Actually, if your main concern is that your resulting arrays are going to be exceptionally large, then your best approach is to submit an individual query for each distinct "conversation_ID" and then combine your results. In plain MongoDB 2.6 shell syntax, which might need some tweaking depending on what your language implementation actually is:

var results = [];
db.messages.aggregate([
    { "$group": {
        "_id": "$conversation_ID"
    }}
]).forEach(function(doc) {
    db.messages.aggregate([
        { "$match": { "conversation_ID": doc._id } },
        { "$limit": 10 },
        { "$group": {
            "_id": "$conversation_ID",
            "msgs": { "$push": "$_id" }
        }}
    ]).forEach(function(res) {
        results.push( res );
    });
});

But it all depends on whether that is what you are trying to avoid. So on to the real answer:


The first issue here is that there is no function to "limit" the number of items that are "pushed" into an array. It is certainly something we would like, but the functionality does not presently exist.

The second issue is that even when pushing all items into an array, you cannot use $slice, or any similar operator in the aggregation pipeline. So there is no present way to get just the "top 10" results from a produced array with a simple operation.

But you can actually produce a set of operations to effectively "slice" on your grouping boundaries. It is fairly involved, and for example here I will reduce the array elements "sliced" to "six" only. The main reason here is to demonstrate the process and show how to do this without being destructive with arrays that do not contain the total you want to "slice" to.

Given a sample of documents:

{ "_id" : 1, "conversation_ID" : 123 }
{ "_id" : 2, "conversation_ID" : 123 }
{ "_id" : 3, "conversation_ID" : 123 }
{ "_id" : 4, "conversation_ID" : 123 }
{ "_id" : 5, "conversation_ID" : 123 }
{ "_id" : 6, "conversation_ID" : 123 }
{ "_id" : 7, "conversation_ID" : 123 }
{ "_id" : 8, "conversation_ID" : 123 }
{ "_id" : 9, "conversation_ID" : 123 }
{ "_id" : 10, "conversation_ID" : 123 }
{ "_id" : 11, "conversation_ID" : 123 }
{ "_id" : 12, "conversation_ID" : 456 }
{ "_id" : 13, "conversation_ID" : 456 }
{ "_id" : 14, "conversation_ID" : 456 }
{ "_id" : 15, "conversation_ID" : 456 }
{ "_id" : 16, "conversation_ID" : 456 }

You can see there that when grouping by your conditions you will get one array with eleven elements and another with "five". What you want to do here is reduce both to the top "six" without "destroying" the array that will only match "five" elements.

And the following query:

db.messages.aggregate([
    { "$group": {
        "_id": "$conversation_ID",
        "first": { "$first": "$_id" },
        "msgs": { "$push": "$_id" },
    }},
    { "$unwind": "$msgs" },
    { "$project": {
        "msgs": 1,
        "first": 1,
        "seen": { "$eq": [ "$first", "$msgs" ] }
    }},
    { "$sort": { "seen": 1 }},
    { "$group": {
        "_id": "$_id",
        "msgs": { 
            "$push": {
               "$cond": [ { "$not": "$seen" }, "$msgs", false ]
            }
        },
        "first": { "$first": "$first" },
        "second": { "$first": "$msgs" }
    }},
    { "$unwind": "$msgs" },
    { "$project": {
        "msgs": 1,
        "first": 1,
        "second": 1,
        "seen": { "$eq": [ "$second", "$msgs" ] }
    }},
    { "$sort": { "seen": 1 }},
    { "$group": {
        "_id": "$_id",
        "msgs": { 
            "$push": {
               "$cond": [ { "$not": "$seen" }, "$msgs", false ]
            }
        },
        "first": { "$first": "$first" },
        "second": { "$first": "$second" },
        "third": { "$first": "$msgs" }
    }},
    { "$unwind": "$msgs" },
    { "$project": {
        "msgs": 1,
        "first": 1,
        "second": 1,
        "third": 1,
        "seen": { "$eq": [ "$third", "$msgs" ] },
    }},
    { "$sort": { "seen": 1 }},
    { "$group": {
        "_id": "$_id",
        "msgs": { 
            "$push": {
               "$cond": [ { "$not": "$seen" }, "$msgs", false ]
            }
        },
        "first": { "$first": "$first" },
        "second": { "$first": "$second" },
        "third": { "$first": "$third" },
        "forth": { "$first": "$msgs" }
    }},
    { "$unwind": "$msgs" },
    { "$project": {
        "msgs": 1,
        "first": 1,
        "second": 1,
        "third": 1,
        "forth": 1,
        "seen": { "$eq": [ "$forth", "$msgs" ] }
    }},
    { "$sort": { "seen": 1 }},
    { "$group": {
        "_id": "$_id",
        "msgs": { 
            "$push": {
               "$cond": [ { "$not": "$seen" }, "$msgs", false ]
            }
        },
        "first": { "$first": "$first" },
        "second": { "$first": "$second" },
        "third": { "$first": "$third" },
        "forth": { "$first": "$forth" },
        "fifth": { "$first": "$msgs" }
    }},
    { "$unwind": "$msgs" },
    { "$project": {
        "msgs": 1,
        "first": 1,
        "second": 1,
        "third": 1,
        "forth": 1,
        "fifth": 1,
        "seen": { "$eq": [ "$fifth", "$msgs" ] }
    }},
    { "$sort": { "seen": 1 }},
    { "$group": {
        "_id": "$_id",
        "msgs": { 
            "$push": {
               "$cond": [ { "$not": "$seen" }, "$msgs", false ]
            }
        },
        "first": { "$first": "$first" },
        "second": { "$first": "$second" },
        "third": { "$first": "$third" },
        "forth": { "$first": "$forth" },
        "fifth": { "$first": "$fifth" },
        "sixth": { "$first": "$msgs" },
    }},
    { "$project": {
         "first": 1,
         "second": 1,
         "third": 1,
         "forth": 1,
         "fifth": 1,
         "sixth": 1,
         "pos": { "$const": [ 1,2,3,4,5,6 ] }
    }},
    { "$unwind": "$pos" },
    { "$group": {
        "_id": "$_id",
        "msgs": {
            "$push": {
                "$cond": [
                    { "$eq": [ "$pos", 1 ] },
                    "$first",
                    { "$cond": [
                        { "$eq": [ "$pos", 2 ] },
                        "$second",
                        { "$cond": [
                            { "$eq": [ "$pos", 3 ] },
                            "$third",
                            { "$cond": [
                                { "$eq": [ "$pos", 4 ] },
                                "$forth",
                                { "$cond": [
                                    { "$eq": [ "$pos", 5 ] },
                                    "$fifth",
                                    { "$cond": [
                                        { "$eq": [ "$pos", 6 ] },
                                        "$sixth",
                                        false
                                    ]}
                                ]}
                            ]}
                        ]}
                    ]}
                ]
            }
        }
    }},
    { "$unwind": "$msgs" },
    { "$match": { "msgs": { "$ne": false } }},
    { "$group": {
        "_id": "$_id",
        "msgs": { "$push": "$msgs" }
    }}
])

You get the top results in the array, up to six entries:

{ "_id" : 123, "msgs" : [ 1, 2, 3, 4, 5, 6 ] }
{ "_id" : 456, "msgs" : [ 12, 13, 14, 15 ] }


As you can see here, loads of fun.

After the initial grouping you basically want to "pop" the $first value off of the stack for the array results. To simplify this a little, we actually do this in the initial operation. So the process becomes ( one pass of the cycle is sketched after the list ):

  • $unwind the array
  • Compare to the values already seen with an $eq equality match
  • $sort the results to "float" false unseen values to the top ( this still retains order )
  • $group back again and "pop" the $first unseen value as the next member on the stack. Also this uses the $cond operator to replace "seen" values in the array stack with false to help in the evaluation.
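
Taken on its own, one pass of that cycle looks like the sketch below. These are just the opening stages of the full pipeline already shown above, pulled out so the "pop" of the "second" member is easier to follow:

db.messages.aggregate([
    // Initial grouping: capture the "first" member while pushing the whole array
    { "$group": {
        "_id": "$conversation_ID",
        "first": { "$first": "$_id" },
        "msgs": { "$push": "$_id" }
    }},
    // $unwind the array
    { "$unwind": "$msgs" },
    // Compare each member to the value already "seen"
    { "$project": {
        "msgs": 1,
        "first": 1,
        "seen": { "$eq": [ "$first", "$msgs" ] }
    }},
    // Float the unseen ( false ) values to the top; order is otherwise retained
    { "$sort": { "seen": 1 } },
    // "Pop" the next unseen member as "second", blanking out seen values with false
    { "$group": {
        "_id": "$_id",
        "msgs": {
            "$push": { "$cond": [ { "$not": "$seen" }, "$msgs", false ] }
        },
        "first": { "$first": "$first" },
        "second": { "$first": "$msgs" }
    }}
])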

The final action with $cond is there to make sure that future iterations are not just adding the last value of the array over and over where the "slice" count is greater than the array members.

That whole process needs to be repeated for as many items as you wish to "slice". Since we already found the "first" item in the initial grouping, that means n-1 iterations for the desired slice result.

The final steps are really just an optional illustration of converting everything back into arrays for the result as finally shown. So really just conditionally pushing items or false back by their matching position and finally "filtering" out all the false values so the end arrays have "six" and "five" members respectively.

So there is not a standard operator to accommodate this, and you cannot just "limit" the push to 5 or 10 or whatever items in the array. But if you really have to do it, then this is your best approach.


You could possibly approach this with mapReduce and forsake the aggregation framework altogether. The approach I would take ( within reasonable limits ) would be to effectively have an in-memory hash-map on the server and accumulate arrays to that, while using JavaScript slice to "limit" the results:

db.messages.mapReduce(
    function () {

        if ( !stash.hasOwnProperty(this.conversation_ID) ) {
            stash[this.conversation_ID] = [];
        }

        if ( stash[this.conversation_ID].length < maxLen ) {
            stash[this.conversation_ID].push( this._id );
            emit( this.conversation_ID, 1 );
        }

    },
    function(key,values) {
        return 1;   // really just want to keep the keys
    },
    { 
        "scope": { "stash": {}, "maxLen": 10 },
        "finalize": function(key,value) {
            return { "msgs": stash[key] };                
        },
        "out": { "inline": 1 }
    }
)

So that just basically builds up the "in-memory" object matching the emitted "keys" with an array never exceeding the maximum size you want to fetch from your results. Additionally this does not even bother to "emit" the item when the maximum stack is met.

The reduce part actually does nothing other than essentially just reduce to "key" and a single value. So just in case our reducer did not get called, as would be true if only 1 value existed for a key, the finalize function takes care of mapping the "stash" keys to the final output.

The effectiveness of this varies on the size of the output, and JavaScript evaluation is certainly not fast, but possibly faster than processing large arrays in a pipeline.
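
Since the output is "inline", the shell hands the results straight back rather than writing them to a collection. A minimal sketch of reading them, assuming the map, reduce and finalize functions from the example above have been saved to the hypothetical names mapFn, reduceFn and finalizeFn:

var mr = db.messages.mapReduce(
    mapFn,        // the map function shown above
    reduceFn,     // the reduce function shown above
    {
        "scope": { "stash": {}, "maxLen": 10 },
        "finalize": finalizeFn,   // the finalize function shown above
        "out": { "inline": 1 }
    }
);

// Inline output comes back under .results as { _id: <key>, value: { "msgs": [ ... ] } }
mr.results.forEach(function(doc) {
    printjson({ "conversation_ID": doc._id, "msgs": doc.value.msgs });
});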


Vote up the JIRA issues to actually have a "slice" operator or even a "limit" on "$push" and "$addToSet", which would both be handy. Personally hoping that at least some modification can be made to the $map operator to expose the "current index" value when processing. That would effectively allow "slicing" and other operations.

Really you would want to code this up to "generate" all of the required iterations. If the answer here gets enough love and/or other time pending that I have in tuits, then I might add some code to demonstrate how to do this. It is already a reasonably long response.


Code to generate pipeline:

var key = "$conversation_ID";
var val = "$_id";
var maxLen = 10;

var stack = [];
var pipe = [];
var fproj = { "$project": { "pos": { "$const": []  } } };

for ( var x = 1; x <= maxLen; x++ ) {

    fproj["$project"][""+x] = 1;
    fproj["$project"]["pos"]["$const"].push( x );

    var rec = {
        "$cond": [ { "$eq": [ "$pos", x ] }, "$"+x ]
    };
    if ( stack.length == 0 ) {
        rec["$cond"].push( false );
    } else {
        lval = stack.pop();
        rec["$cond"].push( lval );
    }

    stack.push( rec );

    if ( x == 1) {
        pipe.push({ "$group": {
           "_id": key,
           "1": { "$first": val },
           "msgs": { "$push": val }
        }});
    } else {
        pipe.push({ "$unwind": "$msgs" });
        var proj = {
            "$project": {
                "msgs": 1
            }
        };

        proj["$project"]["seen"] = { "$eq": [ "$"+(x-1), "$msgs" ] };

        var grp = {
            "$group": {
                "_id": "$_id",
                "msgs": {
                    "$push": {
                        "$cond": [ { "$not": "$seen" }, "$msgs", false ]
                    }
                }
            }
        };

        for ( n=x; n >= 1; n-- ) {
            if ( n != x ) 
                proj["$project"][""+n] = 1;
            grp["$group"][""+n] = ( n == x ) ? { "$first": "$msgs" } : { "$first": "$"+n };
        }

        pipe.push( proj );
        pipe.push({ "$sort": { "seen": 1 } });
        pipe.push(grp);
    }
}

pipe.push(fproj);
pipe.push({ "$unwind": "$pos" });
pipe.push({
    "$group": {
        "_id": "$_id",
        "msgs": { "$push": stack[0] }
    }
});
pipe.push({ "$unwind": "$msgs" });
pipe.push({ "$match": { "msgs": { "$ne": false } }});
pipe.push({
    "$group": {
        "_id": "$_id",
        "msgs": { "$push": "$msgs" }
    }
}); 

That builds the basic iterative approach up to maxLen with the steps from $unwind to $group. Also embedded in there are details of the final projections required and the "nested" conditional statement. The last is basically the approach taken on this question:

Does MongoDB's $in clause guarantee order?
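
For completeness, running the generated pipeline is then just a matter of handing pipe to aggregate. A minimal sketch, assuming the generator script above has been run in the same shell session:

// Optionally inspect the stages that were generated
printjson( pipe );

// Run the generated pipeline; each result holds at most maxLen message ids
db.messages.aggregate( pipe ).forEach(function(doc) {
    printjson( doc );
});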
