聚合积累内部对象 [英] Aggregation Accumulate Inner Objects

查看:74
本文介绍了聚合积累内部对象的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我是mongo聚合的新手,我需要帮助创建一个,

I'm new with mongo aggregations, and I need a help with creating one,

我有一个以下文档的集合作为示例:

I have a collection of the following document as an example:

{
    "_id" : ObjectId("5afc2f06e1da131c9802071e"),
    "_class" : "Traveler",
    "name" : "John Due",
    "startTimestamp" : 1526476550933,
    "endTimestamp" : 1526476554823,
    "source" : "istanbul",
    "cities" : [ 
        {
            "_id" : "ef8f6b26328f-0663202f94faeaeb-3981",
            "name" : "Moscow",
            "timestamp" : 1526476550940,
            "timeSpent" : 3180
        },

        {
            "_id" : "ef8f6b26328f-0663202f94faeaeb-1122",
            "name" : "Cairo",
            "timestamp" : 1625476550940,
            "timeSpent" : 318000,
        },
         {
            "_id" : "ef8f6b26328f-0663202f94faeaeb-3981",
            "name" : "Moscow",
            "timestamp" : 15211276550940,
            "timeSpent" : 318011
        }


    ],
    "variables" : [ 
        {
            "_id" : "cd4318a83c9b-a8478d76bfd3e4b6-5967",
            "name" : "Customer Profile",
            "lastValue" : "",
            "values" : [],
            "additionalData" : {}
        }, 
        {
            "_id" : "366cb8c07996-c62c37a87a86d526-d3e7",
            "name" : "Target Telephony Queue",
            "lastValue" : "",
            "values" : [],
            "additionalData" : {}
        }, 
        {
            "_id" : "4ed84742da33-d70ba8a809b712f3-bdf4",
            "name" : "IMEI",
            "lastValue" : "",
            "values" : [],
            "additionalData" : {}
        }, 

        {
            "_id" : "c8103687c1c8-97d749e349d785c8-9154",
            "name" : "Budget",
            "defaultValue" : "",
            "lastValue" : "",
            "values" : [ 
                {
                    "value" : "3000",
                    "timestamp" : NumberLong(1526476550940),
                    "element" : "c8103687c1c8-97d749e349d785c8-9154"
                }
            ],
            "additionalData" : {}
        }
    ]
}

我需要一个结果文件显示每个该集合中的每个旅行者都访问过该城市,并且平均预算(预算是变量数组中的一个元素

I need to have a resulting document showing the how many times each city have been visited by each traveler in the collection, and the average budget (budget is an element in the variables array

所以生成的文档将类似于:

so the resulting document will be similar to:

{
     "_id" : ObjectId("5afc2f06e1da131c9802071e"),
     "_class" : "Traveler",
     "name" : "John Due",
     "startTimestamp" : 1526476550933,
     "endTimestamp" : 1526476554823,
     "source" : "istanbul",
    "cities" : [ 
        {
            "_id" : "ef8f6b26328f-0663202f94faeaeb-3981",
            "name" : "Moscow",
            "visited":2
        },
        {
            "_id" : "ef8f6b26328f-0663202f94faeaeb-1122",
            "name" : "Cairo",
            "visited":1
        }
    ],
    "variables" : [ 
        {
                "_id" : "c8103687c1c8-97d749e349d785c8-9154",
                "name" : "Budget",
                "defaultValue" : "",
                "lastValue" : "",
                "values" : [ 
                    {
                        "value" : "3000",
                    }
                ],
            }
    ],
}

谢谢你你的帮助

推荐答案

快速说明,你需要更改valuevalues中的$ c>字段为数字,因为它是presen一根绳子。但回答:

As a quick note, you need to change your "value" field inside the "values" to be numeric, since it's presently a string. But on to the answer:

如果您有权访问 $ reduce ,那么你实际上可以这样做:

If you have access to $reduce from MongoDB 3.4, then you can actually do something like this:

db.collection.aggregate([
  { "$addFields": {
     "cities": {
       "$reduce": {
         "input": "$cities",
         "initialValue": [],
         "in": {
           "$cond": {
             "if": { "$ne": [{ "$indexOfArray": ["$$value._id", "$$this._id"] }, -1] },
             "then": {
               "$concatArrays": [
                 { "$filter": {
                   "input": "$$value",
                   "as": "v",
                   "cond": { "$ne": [ "$$this._id", "$$v._id" ] }
                 }},
                 [{
                   "_id": "$$this._id",
                   "name": "$$this.name",
                   "visited": {
                     "$add": [
                       { "$arrayElemAt": [
                         "$$value.visited",
                         { "$indexOfArray": [ "$$value._id", "$$this._id" ] }
                       ]},
                       1
                     ]
                   }
                 }]
               ]
             },
             "else": {
               "$concatArrays": [
                 "$$value",
                 [{
                   "_id": "$$this._id",
                   "name": "$$this.name",
                   "visited": 1
                 }]
               ]
             }
           }
         }
       }
     },
     "variables": {
       "$map": {
         "input": {
           "$filter": {
             "input": "$variables",
             "cond": { "$eq": ["$$this.name", "Budget"] } 
           }
         },
         "in": {
           "_id": "$$this._id",
           "name": "$$this.name",
           "defaultValue": "$$this.defaultValue",
           "lastValue": "$$this.lastValue",
           "value": { "$avg": "$$this.values.value" }
         }
       }
     }
  }}
])

如果您有MongoDB 3.6,可以使用 $ mergeObjects

If you have MongoDB 3.6, you can clean that up a bit with $mergeObjects:

db.collection.aggregate([
  { "$addFields": {
     "cities": {
       "$reduce": {
         "input": "$cities",
         "initialValue": [],
         "in": {
           "$cond": {
             "if": { "$ne": [{ "$indexOfArray": ["$$value._id", "$$this._id"] }, -1] },
             "then": {
               "$concatArrays": [
                 { "$filter": {
                   "input": "$$value",
                   "as": "v",
                   "cond": { "$ne": [ "$$this._id", "$$v._id" ] }
                 }},
                 [{
                   "_id": "$$this._id",
                   "name": "$$this.name",
                   "visited": {
                     "$add": [
                       { "$arrayElemAt": [
                         "$$value.visited",
                         { "$indexOfArray": [ "$$value._id", "$$this._id" ] }
                       ]},
                       1
                     ]
                   }
                 }]
               ]
             },
             "else": {
               "$concatArrays": [
                 "$$value",
                 [{
                   "_id": "$$this._id",
                   "name": "$$this.name",
                   "visited": 1
                 }]
               ]
             }
           }
         }
       }
     },
     "variables": {
       "$map": {
         "input": {
           "$filter": {
             "input": "$variables",
             "cond": { "$eq": ["$$this.name", "Budget"] } 
           }
         },
         "in": {
           "$mergeObjects": [
             "$$this",
             { "values": { "$avg": "$$this.values.value" } }
           ]
         }
       }
     }
  }}
])

但它是或多或少相同的事情,除了我们保持 additionalData

But it's more or less the same thing except we keep the additionalData

在此之前稍微回过头来,你可以随时 $ unwind cities累积:

Going back a little before that, then you can always $unwind the "cities" to accumulate:

db.collection.aggregate([
  { "$unwind": "$cities" },
  { "$group": {
     "_id": { 
       "_id": "$_id",
       "cities": {
         "_id": "$cities._id",
         "name": "$cities.name"
       }
     },
     "_class": { "$first": "$class" },
     "name": { "$first": "$name" },
     "startTimestamp": { "$first": "$startTimestamp" },
     "endTimestamp" : { "$first": "$endTimestamp" },
     "source" : { "$first": "$source" },
     "variables": { "$first": "$variables" },
     "visited": { "$sum": 1 }
  }},
  { "$group": {
     "_id": "$_id._id",
     "_class": { "$first": "$class" },
     "name": { "$first": "$name" },
     "startTimestamp": { "$first": "$startTimestamp" },
     "endTimestamp" : { "$first": "$endTimestamp" },
     "source" : { "$first": "$source" },
     "cities": {
       "$push": {
         "_id": "$_id.cities._id",
         "name": "$_id.cities.name",
         "visited": "$visited"
       }
     },
     "variables": { "$first": "$variables" },
  }},
  { "$addFields": {
     "variables": {
       "$map": {
         "input": {
           "$filter": {
             "input": "$variables",
             "cond": { "$eq": ["$$this.name", "Budget"] } 
           }
         },
         "in": {
           "_id": "$$this._id",
           "name": "$$this.name",
           "defaultValue": "$$this.defaultValue",
           "lastValue": "$$this.lastValue",
           "value": { "$avg": "$$this.values.value" }
         }
       }
     }
  }}
])

所有回报(几乎)都是一样的:

All return (almost) the same thing:

{
        "_id" : ObjectId("5afc2f06e1da131c9802071e"),
        "_class" : "Traveler",
        "name" : "John Due",
        "startTimestamp" : 1526476550933,
        "endTimestamp" : 1526476554823,
        "source" : "istanbul",
        "cities" : [
                {
                        "_id" : "ef8f6b26328f-0663202f94faeaeb-1122",
                        "name" : "Cairo",
                        "visited" : 1
                },
                {
                        "_id" : "ef8f6b26328f-0663202f94faeaeb-3981",
                        "name" : "Moscow",
                        "visited" : 2
                }
        ],
        "variables" : [
                {
                        "_id" : "c8103687c1c8-97d749e349d785c8-9154",
                        "name" : "Budget",
                        "defaultValue" : "",
                        "lastValue" : "",
                        "value" : 3000
                }
        ]
}

前两种形式是当然,这是最理想的事情,因为他们只是在同一个文档内工作。

The first two forms are of course the most optimal thing to do since they are simply working "within" the same document at all times.

$ reduce 允许数组上的累积表达式,所以我们可以在这里使用它来保持一个简化数组,我们使用_ id值/ manual / reference / operator / aggregation / indexOfArray /rel =nofollow noreferrer> $ indexOfArray 以查看是否已有累积项目匹配。 -1 的结果意味着它不存在。

Operators like $reduce allow "accumulation" expressions on arrays, so we can use it here to keep a "reduced" array which we test for the unique "_id" value using $indexOfArray in order to see if the there already is an accumulated item that matches. A result of -1 means it's not there.

为了构造一个简化数组,我们采取initialValue [] 作为一个空数组,然后通过 $ concatArrays 。所有这一过程都是通过三元 决定的。 $ cond 运营商认为if条件和然后加入 $ filter的输出 当前 $$值,以排除当前索引 _id 条目当然还有另一个表示单个对象的数组。

In order construct a "reduced array" we take the "initialValue" of [] as an empty array and then add to it via $concatArrays. All of that process is decided via the "ternary" $cond operator which considers the "if" condition and "then" either "joins" the output of the $filter on the current $$value to exclude the current index _id entry, with of course another "array" representing the singular object.

对于那个对象,我们再次使用 $ indexOfArray 实际获得匹配的索引,因为我们知道itemis there,并使用它通过visited值/参考/操作者/ aggrega / dataElemAt /rel =nofollow noreferrer> $ arrayElemAt $ add 以便递增。

For that "object" we again use the $indexOfArray to actually get the matched index since we know that the item "is there", and use that to extract the current "visited" value from that entry via $arrayElemAt and $add to it in order to increment.

else的情况下,我们只需添加一个数组作为对象,它只有一个默认的访问过 1 的值。使用这两种情况有效地在数组中累积唯一值以输出。

In the "else" case we simply add an "array" as an "object" which just has a default "visited" value of 1. Using both those cases effectively accumulates unique values within the array to output.

在后一版本中,我们只是 $ unwind 数组并使用连续的 $ group 为了第一阶段对独特的内部条目进行计数,然后将重新构建数组转换为类似的形式。

In the latter version, we just $unwind the array and use successive $group stages in order to first "count" on the unique inner entries, and then "re-construct the array" into the similar form.

使用 $ unwind 看起来更简单,但是因为它是什么实际上是为每个数组条目获取文档的副本,然后这实际上增加了处理的相当大的开销。在现代版本中,通常有数组运算符,这意味着除非您的意图是跨文档累积,否则不需要使用它。因此,如果您确实需要 $ group 来自数组内部的键的值,然后就是你实际需要使用它的地方。

Using $unwind looks far more simple, but since what it actually does is take a copy of the document for every array entry, then this actually adds considerable overhead to processing. In modern versions there are generally array operators which mean you don't need to use this unless your intention is to "accumulate across documents". So if you actually need to $group on a value of a key from "inside" an array, then that is where you actually do need to use it.

至于变量然后我们可以简单地使用 $ filter 此处再次获取匹配的预算条目。我们这样做是作为 $ map <的输入/ code> 运算符允许重新整形数组内容。我们主要想要这样你可以获取values的内容(一旦你把它全部数字化)并使用 $ avg 运算符,提供字段路径符号直接形成数组值,因为它实际上可以从这样的输入返回一个结果。

As for the "variables" then we can simply use the $filter again here to get the matching "Budget" entry. We do this as the input to the $map operator which allows "re-shaping" of the array content. We mainly want that so you can take the content of the "values" ( once you make it all numeric ) and use the $avg operator, which is supplied that "field path notation" form directly to the array values because it can in fact return a result from such an input.

这通常会导致几乎所有主要数组的数组运算符用于聚合管道(不包括集合运算符),所有这些都在一个管道阶段内。

That generally makes the tour of pretty much ALL of the main "array operators" for the aggregation pipeline ( excluding the "set" operators ) all within a single pipeline stage.

也不要忘记你总是想要< a href =https://docs.mongodb.com/manual/reference/operator/aggregation/match/ =nofollow noreferrer> $ match 将常规查询运营商列为第一阶段 任何聚合管道只需选择您需要的文件。理想情况下使用索引。

Also don't ever forget that you just about always want to $match with regular Query Operators as the "very first stage" of any aggregation pipeline in order to just select the documents you need. Ideally using an index.

备选工作正常通过客户端代码中的文档。通常不建议使用它,因为上面的所有方法都显示它们实际上减少从服务器返回的内容,通常是服务器聚合。

Alternates are working through the documents in client code. It generally would not be recommended since all methods above show they actually "reduce" the content as returned from the server, as is generally the point of "server aggregations".

可能由于基于文档的性质,使用 $ unwind 可能需要更长的时间,而客户端处理可能是一个选项,但我会考虑它更有可能

It "may" be possible due to the "document based" nature that larger result sets may take considerably more time using $unwind and client processing could be an option, but I would consider it far more likely

下面是一个列表,演示如何将结果应用于游标流,同时返回结果。有三个演示版本的转换,显示完全与上面相同的逻辑,一个实现与 lodash 积累的方法,以及<$的自然积累c $ c>地图实施:

Below is a listing that demonstrates applying a transform to the cursor stream as results are returned doing the same thing. There are three demonstrated versions of the transform, showing "exactly" the same logic as above, a implementation with lodash methods for accumulation, and a "natural" accumulation on the Map implementation:

const { MongoClient } = require('mongodb');
const { chain } = require('lodash');

const uri = 'mongodb://localhost:27017';
const opts = { useNewUrlParser: true };

const log = data => console.log(JSON.stringify(data, undefined, 2));

const transform = ({ cities, variables, ...d }) => ({
  ...d,
  cities: cities.reduce((o,{ _id, name }) =>
    (o.map(i => i._id).indexOf(_id) != -1)
      ? [
          ...o.filter(i => i._id != _id),
          { _id, name, visited: o.find(e => e._id === _id).visited + 1 }
        ]
      : [ ...o, { _id, name, visited: 1 } ]
  , []).sort((a,b) => b.visited - a.visited),
  variables: variables.filter(v => v.name === "Budget")
    .map(({ values, additionalData, ...v }) => ({
      ...v,
      values: (values != undefined)
        ? values.reduce((o,e) => o + e.value, 0) / values.length
        : 0
    }))
});

const alternate = ({ cities, variables, ...d }) => ({
  ...d,
  cities: chain(cities)
    .groupBy("_id")
    .toPairs()
    .map(([k,v]) =>
      ({
        ...v.reduce((o,{ _id, name }) => ({ ...o, _id, name }),{}),
        visited: v.length
      })
    )
    .sort((a,b) => b.visited - a.visited)
    .value(),
  variables: variables.filter(v => v.name === "Budget")
    .map(({ values, additionalData, ...v }) => ({
      ...v,
      values: (values != undefined)
        ? values.reduce((o,e) => o + e.value, 0) / values.length
        : 0
    }))

});

const natural = ({ cities, variables, ...d }) => ({
  ...d,
  cities: [
    ...cities
      .reduce((o,{ _id, name }) => o.set(_id,
        [ ...(o.has(_id) ? o.get(_id) : []), { _id, name } ]), new Map())
      .entries()
  ]
  .map(([k,v]) =>
    ({
      ...v.reduce((o,{ _id, name }) => ({ ...o, _id, name }),{}),
      visited: v.length
    })
  )
  .sort((a,b) => b.visited - a.visited),
  variables: variables.filter(v => v.name === "Budget")
    .map(({ values, additionalData, ...v }) => ({
      ...v,
      values: (values != undefined)
        ? values.reduce((o,e) => o + e.value, 0) / values.length
        : 0
    }))

});

(async function() {

  try {

    const client = await MongoClient.connect(uri, opts);

    let db = client.db('test');
    let coll = db.collection('junk');

    let cursor = coll.find().map(natural);

    while (await cursor.hasNext()) {
      let doc = await cursor.next();
      log(doc);
    }

    client.close();

  } catch(e) {
    console.error(e)
  } finally {
    process.exit()
  }

})()

这篇关于聚合积累内部对象的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆