MongoDB:将来自多个集合的数据合并成一个..如何?

我如何(在MongoDB中)将来自多个集合的数据合并到一个集合中?

我可以使用map-reduce吗?如果是,那么怎么样?

我将不胜感激,因为我是新手。

尽管无法实时执行此操作,但您可以多次运行map-reduce,通过使用MongoDB 1.8+ map/reduce中的“reduce”输出选项来合并数据(请参阅 http://www.mongodb.org/display/DOCS/MapReduce#MapReduce-Outputoptions )。 你需要在这两个集合中都有一些可以用作_id的键。

例如,假设您有一个users集合和一个comments集合,并且您希望有一个新的集合,其中包含每个评论的一些用户人口统计信息。

假设users集合具有以下字段:

  • _id
  • firstName
  • lastName
  • country
  • gender
  • age

然后comments集合包含以下字段:

  • _id
  • userId
  • comment
  • created

你可以执行如下的map/reduce:

 // Join the `users` and `comments` collections into `users_comments` (one
 // document per user id) by running two map-reduce passes that share a single
 // reduce function and both write with the "reduce" output mode.
 var mapUsers, mapComments, reduce;

 // Clear any previous output. remove() is given an explicit match-all query
 // because newer shells reject a call without one.
 db.users_comments.remove({});

 // setup sample data - wouldn't actually use this in production
 db.users.remove({});
 db.comments.remove({});
 db.users.insert({firstName:"Rich",lastName:"S",gender:"M",country:"CA",age:"18"});
 db.users.insert({firstName:"Rob",lastName:"M",gender:"M",country:"US",age:"25"});
 db.users.insert({firstName:"Sarah",lastName:"T",gender:"F",country:"US",age:"13"});
 // Materialize the cursor so the users can be addressed by position below.
 var users = db.users.find().toArray();
 db.comments.insert({userId: users[0]._id, "comment": "Hey, what's up?", created: new ISODate()});
 db.comments.insert({userId: users[1]._id, "comment": "Not much", created: new ISODate()});
 db.comments.insert({userId: users[0]._id, "comment": "Cool", created: new ISODate()});
 // end sample data setup

 // Emit each user's demographic fields under the user's _id.
 mapUsers = function() {
   var values = {
     country: this.country,
     gender: this.gender,
     age: this.age
   };
   emit(this._id, values);
 };

 // Emit each comment under the id of the user who wrote it, so that it is
 // reduced together with that user's demographic fields.
 mapComments = function() {
   var values = {
     commentId: this._id,
     comment: this.comment,
     created: this.created
   };
   emit(this.userId, values);
 };

 // Merge every value seen for one user id into a single document.
 //   k      - the user id the values were emitted under
 //   values - a mix of user values, single-comment values and documents
 //            produced by an earlier run of this same function
 // Returns the user's fields plus a `comments` array.
 reduce = function(k, values) {
   var result = {};
   // Fields that belong to an individual comment; they must never be copied
   // to the top level of the merged document.
   var commentFields = {
     "commentId": '',
     "comment": '',
     "created": ''
   };
   values.forEach(function(value) {
     var field;
     if ("comment" in value) {
       // A single comment straight from mapComments.
       if (!("comments" in result)) {
         result.comments = [];
       }
       result.comments.push(value);
     } else if ("comments" in value) {
       // An already-reduced document: append its comments instead of nesting.
       if (!("comments" in result)) {
         result.comments = [];
       }
       result.comments.push.apply(result.comments, value.comments);
     }
     for (field in value) {
       if (value.hasOwnProperty(field) && !(field in commentFields)) {
         result[field] = value[field];
       }
     }
   });
   return result;
 };

 // "reduce" output mode folds each pass into whatever users_comments already
 // holds for the same key, using the reduce function above.
 db.users.mapReduce(mapUsers, reduce, {"out": {"reduce": "users_comments"}});
 db.comments.mapReduce(mapComments, reduce, {"out": {"reduce": "users_comments"}});
 db.users_comments.find().pretty(); // see the resulting collection

此时,您将拥有一个名为users_comments的新集合,其中包含合并后的数据,现在就可以使用该集合了。 这些经过reduce的集合中的文档都有_id,即您在map函数中emit的键,而所有的值都是value键内的子对象 – 值并不在这些文档的顶层。

这是一个简单的例子。 你可以对更多的集合重复这一过程,不断扩充这个reduce后的集合。 您也可以在此过程中对数据进行汇总和聚合。 随着聚合和保留现有字段的逻辑变得更加复杂,您可能需要定义多个reduce函数。

您还会注意到,现在每个用户都有一个文档,其中包含该用户的所有注释。 如果我们合并具有一对一关系而不是一对多关系的数据,那么它将是平坦的,您可以简单地使用这样的reduce函数:

 // Merge the values emitted for one key into a single flat document.
 //   k      - the key the values were emitted under (not used)
 //   values - array of objects to merge
 // Returns a new object holding every own field of every value; when the same
 // field appears in several values, the last one wins.
 // Declared with `var` so the snippet also works on its own: assigning to an
 // undeclared name throws in strict mode, and redeclaring is harmless when an
 // earlier script already declared `reduce`.
 var reduce = function(k, values) {
   var result = {};
   values.forEach(function(value) {
     var field;
     for (field in value) {
       // Borrow hasOwnProperty so values without that method are still merged.
       if (Object.prototype.hasOwnProperty.call(value, field)) {
         result[field] = value[field];
       }
     }
   });
   return result;
 };

如果你想平整users_comments集合,所以它是每个评论的一个文档,另外运行这个:

 // Flatten users_comments (one document per user, comments nested under
 // value.comments) into comments_with_demographics: one document per comment,
 // keyed by comment id and carrying the author's country and age.
 var map, reduce;

 // Emit one value per nested comment. Documents without value.comments are
 // skipped. The per-document debug print has been removed: it wrote every
 // field of every scanned document to the server output.
 map = function() {
   var that = this;
   if ("comments" in this.value) {
     this.value.comments.forEach(function(value) {
       emit(value.commentId, {
         userId: that._id,
         country: that.value.country,
         age: that.value.age,
         comment: value.comment,
         created: value.created
       });
     });
   }
 };

 // Merge the values for one comment id into a single flat object; when a
 // field appears in several values, the last one wins.
 reduce = function(k, values) {
   var result = {};
   values.forEach(function(value) {
     var field;
     for (field in value) {
       if (value.hasOwnProperty(field)) {
         result[field] = value[field];
       }
     }
   });
   return result;
 };

 db.users_comments.mapReduce(map, reduce, {"out": "comments_with_demographics"});

这种技术绝对不应该实时(on the fly)执行。 它适合于cron作业或类似的定期更新合并数据的任务。 您可能需要在新集合上运行ensureIndex,以确保针对它执行的查询能够快速运行(请记住,您的数据仍然在value键内,因此如果您要在comments_with_demographics集合上按评论的created时间建立索引,应该是db.comments_with_demographics.ensureIndex({"value.created": 1});)。

MongoDB 3.2现在允许通过$lookup聚合阶段将来自多个集合的数据合并为一个。 作为一个实际的例子,假设你有关于书籍的数据,分别存放在两个不同的集合中。

第一个集合叫做books,它有以下数据:

 { "isbn": "978-3-16-148410-0", "title": "Some cool book", "author": "John Doe" } { "isbn": "978-3-16-148999-9", "title": "Another awesome book", "author": "Jane Roe" } 

第二个集合叫做books_selling_data ,它有以下数据:

 { "_id": ObjectId("56e31bcf76cdf52e541d9d26"), "isbn": "978-3-16-148410-0", "copies_sold": 12500 } { "_id": ObjectId("56e31ce076cdf52e541d9d28"), "isbn": "978-3-16-148999-9", "copies_sold": 720050 } { "_id": ObjectId("56e31ce076cdf52e541d9d29"), "isbn": "978-3-16-148999-9", "copies_sold": 1000 } 

合并这两个集合只是以下面的方式使用$ lookup:

 // For every book, attach the books_selling_data documents that have the same
 // isbn, as an array stored under copies_sold.
 db.getCollection("books").aggregate([
   {
     "$lookup": {
       "from": "books_selling_data",
       "localField": "isbn",
       "foreignField": "isbn",
       "as": "copies_sold"
     }
   }
 ]);

聚合之后,得到的结果将如下所示:

 { "isbn": "978-3-16-148410-0", "title": "Some cool book", "author": "John Doe", "copies_sold": [ { "_id": ObjectId("56e31bcf76cdf52e541d9d26"), "isbn": "978-3-16-148410-0", "copies_sold": 12500 } ] } { "isbn": "978-3-16-148999-9", "title": "Another awesome book", "author": "Jane Roe", "copies_sold": [ { "_id": ObjectId("56e31ce076cdf52e541d9d28"), "isbn": "978-3-16-148999-9", "copies_sold": 720050 }, { "_id": ObjectId("56e31ce076cdf52e541d9d29"), "isbn": "978-3-16-148999-9", "copies_sold": 1000 } ] } 

重要的是要注意几件事情:

  1. “from”集合(在本例中即books_selling_data)不能是分片(sharded)集合。
  2. 如上例所示,“as”字段将是一个数组。
  3. 如果$lookup阶段的“localField”和“foreignField”所指定的字段在各自的集合中不存在,它们将被当作null处理($lookup的官方文档中有一个很好的例子)。

所以,总结一下:如果你想合并这两个集合,并且(在本例中)得到一个表示总销量的扁平copies_sold字段,你还需要多做一些工作,可能要使用一个中间集合,再通过$out输出到最终的集合。

如果mongodb中没有批量插入可用,我们可以遍历small_collection中的所有对象,并将它们逐个插入到big_collection中:

 // Copy every document of small_collection into big_collection, issuing one
 // insert per document.
 db.getCollection("small_collection").find().forEach(function (doc) {
   db.getCollection("big_collection").insert(doc);
 });

非常基本的例子$ lookup。

 // Attach each user's userinfo and userrole documents (matched on userId),
 // then unwind both joined arrays. preserveNullAndEmptyArrays keeps users for
 // which a lookup found no match.
 db.users.aggregate([
   {
     "$lookup": {
       "from": "userinfo",
       "localField": "userId",
       "foreignField": "userId",
       "as": "userInfoData"
     }
   },
   {
     "$lookup": {
       "from": "userrole",
       "localField": "userId",
       "foreignField": "userId",
       "as": "userRoleData"
     }
   },
   { "$unwind": { "path": "$userInfoData", "preserveNullAndEmptyArrays": true } },
   { "$unwind": { "path": "$userRoleData", "preserveNullAndEmptyArrays": true } }
 ]);

这里使用

  { $unwind: { path: "$userInfoData", preserveNullAndEmptyArrays: true }}, { $unwind: { path: "$userRoleData", preserveNullAndEmptyArrays: true }} 

代替

 { $unwind:"$userInfoData"} { $unwind:"$userRoleData"} 

因为如果$lookup找不到匹配的记录,{ $unwind: "$userRoleData" } 这种写法会返回空结果(0条结果)。

在聚合中对多个集合使用多个$lookup,如下所示

查询:

 // Look up the orders, time-window types and service-time types for service
 // location "36728", unwind the orders and serviceTime arrays, and return at
 // most 14 documents.
 db.servicelocations.aggregate([
   { "$match": { "serviceLocationId": { "$in": ["36728"] } } },
   {
     "$lookup": {
       "from": "orders",
       "localField": "serviceLocationId",
       "foreignField": "serviceLocationId",
       "as": "orders"
     }
   },
   {
     "$lookup": {
       "from": "timewindowtypes",
       "localField": "timeWindow.timeWindowTypeId",
       "foreignField": "timeWindowTypeId",
       "as": "timeWindow"
     }
   },
   {
     "$lookup": {
       "from": "servicetimetypes",
       "localField": "serviceTimeTypeId",
       "foreignField": "serviceTimeTypeId",
       "as": "serviceTime"
     }
   },
   { "$unwind": "$orders" },
   { "$unwind": "$serviceTime" },
   { "$limit": 14 }
 ]);

结果:

 { "_id" : ObjectId("59c3ac4bb7799c90ebb3279b"), "serviceLocationId" : "36728", "regionId" : 1.0, "zoneId" : "DXBZONE1", "description" : "AL HALLAB REST EMIRATES MALL", "locationPriority" : 1.0, "accountTypeId" : 1.0, "locationType" : "SERVICELOCATION", "location" : { "makani" : "", "lat" : 25.119035, "lng" : 55.198694 }, "deliveryDays" : "MTWRFSU", "timeWindow" : [ { "_id" : ObjectId("59c3b0a3b7799c90ebb32cde"), "timeWindowTypeId" : "1", "Description" : "MORNING", "timeWindow" : { "openTime" : "06:00", "closeTime" : "08:00" }, "accountId" : 1.0 }, { "_id" : ObjectId("59c3b0a3b7799c90ebb32cdf"), "timeWindowTypeId" : "1", "Description" : "MORNING", "timeWindow" : { "openTime" : "09:00", "closeTime" : "10:00" }, "accountId" : 1.0 }, { "_id" : ObjectId("59c3b0a3b7799c90ebb32ce0"), "timeWindowTypeId" : "1", "Description" : "MORNING", "timeWindow" : { "openTime" : "10:30", "closeTime" : "11:30" }, "accountId" : 1.0 } ], "address1" : "", "address2" : "", "phone" : "", "city" : "", "county" : "", "state" : "", "country" : "", "zipcode" : "", "imageUrl" : "", "contact" : { "name" : "", "email" : "" }, "status" : "ACTIVE", "createdBy" : "", "updatedBy" : "", "updateDate" : "", "accountId" : 1.0, "serviceTimeTypeId" : "1", "orders" : [ { "_id" : ObjectId("59c3b291f251c77f15790f92"), "orderId" : "AQ18O1704264", "serviceLocationId" : "36728", "orderNo" : "AQ18O1704264", "orderDate" : "18-Sep-17", "description" : "AQ18O1704264", "serviceType" : "Delivery", "orderSource" : "Import", "takenBy" : "KARIM", "plannedDeliveryDate" : ISODate("2017-08-26T00:00:00.000Z"), "plannedDeliveryTime" : "", "actualDeliveryDate" : "", "actualDeliveryTime" : "", "deliveredBy" : "", "size1" : 296.0, "size2" : 3573.355, "size3" : 240.811, "jobPriority" : 1.0, "cancelReason" : "", "cancelDate" : "", "cancelBy" : "", "reasonCode" : "", "reasonText" : "", "status" : "", "lineItems" : [ { "ItemId" : "BNWB020", "size1" : 15.0, "size2" : 78.6, "size3" : 6.0 }, { "ItemId" : "BNWB021", "size1" : 20.0, 
"size2" : 252.0, "size3" : 11.538 }, { "ItemId" : "BNWB023", "size1" : 15.0, "size2" : 285.0, "size3" : 16.071 }, { "ItemId" : "CPMW112", "size1" : 3.0, "size2" : 25.38, "size3" : 1.731 }, { "ItemId" : "MMGW001", "size1" : 25.0, "size2" : 464.375, "size3" : 46.875 }, { "ItemId" : "MMNB218", "size1" : 50.0, "size2" : 920.0, "size3" : 60.0 }, { "ItemId" : "MMNB219", "size1" : 50.0, "size2" : 630.0, "size3" : 40.0 }, { "ItemId" : "MMNB220", "size1" : 50.0, "size2" : 416.0, "size3" : 28.846 }, { "ItemId" : "MMNB270", "size1" : 50.0, "size2" : 262.0, "size3" : 20.0 }, { "ItemId" : "MMNB302", "size1" : 15.0, "size2" : 195.0, "size3" : 6.0 }, { "ItemId" : "MMNB373", "size1" : 3.0, "size2" : 45.0, "size3" : 3.75 } ], "accountId" : 1.0 }, { "_id" : ObjectId("59c3b291f251c77f15790f9d"), "orderId" : "AQ137O1701240", "serviceLocationId" : "36728", "orderNo" : "AQ137O1701240", "orderDate" : "18-Sep-17", "description" : "AQ137O1701240", "serviceType" : "Delivery", "orderSource" : "Import", "takenBy" : "KARIM", "plannedDeliveryDate" : ISODate("2017-08-26T00:00:00.000Z"), "plannedDeliveryTime" : "", "actualDeliveryDate" : "", "actualDeliveryTime" : "", "deliveredBy" : "", "size1" : 28.0, "size2" : 520.11, "size3" : 52.5, "jobPriority" : 1.0, "cancelReason" : "", "cancelDate" : "", "cancelBy" : "", "reasonCode" : "", "reasonText" : "", "status" : "", "lineItems" : [ { "ItemId" : "MMGW001", "size1" : 25.0, "size2" : 464.38, "size3" : 46.875 }, { "ItemId" : "MMGW001-F1", "size1" : 3.0, "size2" : 55.73, "size3" : 5.625 } ], "accountId" : 1.0 }, { "_id" : ObjectId("59c3b291f251c77f15790fd8"), "orderId" : "AQ110O1705036", "serviceLocationId" : "36728", "orderNo" : "AQ110O1705036", "orderDate" : "18-Sep-17", "description" : "AQ110O1705036", "serviceType" : "Delivery", "orderSource" : "Import", "takenBy" : "KARIM", "plannedDeliveryDate" : ISODate("2017-08-26T00:00:00.000Z"), "plannedDeliveryTime" : "", "actualDeliveryDate" : "", "actualDeliveryTime" : "", "deliveredBy" : "", "size1" : 
60.0, "size2" : 1046.0, "size3" : 68.0, "jobPriority" : 1.0, "cancelReason" : "", "cancelDate" : "", "cancelBy" : "", "reasonCode" : "", "reasonText" : "", "status" : "", "lineItems" : [ { "ItemId" : "MMNB218", "size1" : 50.0, "size2" : 920.0, "size3" : 60.0 }, { "ItemId" : "MMNB219", "size1" : 10.0, "size2" : 126.0, "size3" : 8.0 } ], "accountId" : 1.0 } ], "serviceTime" : { "_id" : ObjectId("59c3b07cb7799c90ebb32cdc"), "serviceTimeTypeId" : "1", "serviceTimeType" : "nohelper", "description" : "", "fixedTime" : 30.0, "variableTime" : 0.0, "accountId" : 1.0 } } 

代码片段。 致谢 – 参考了Stack Overflow上的多个帖子,包括本帖。

  // Join the `cust` and `zip` collections on zip_id with incremental
  // map-reduce, flatten the result to one document per customer, and finally
  // move each document's `value` sub-object up to the top level.

  // Sample data.
  db.cust.drop();
  db.zip.drop();
  db.cust.insert({cust_id:1, zip_id: 101});
  db.cust.insert({cust_id:2, zip_id: 101});
  db.cust.insert({cust_id:3, zip_id: 101});
  db.cust.insert({cust_id:4, zip_id: 102});
  db.cust.insert({cust_id:5, zip_id: 102});
  db.zip.insert({zip_id:101, zip_cd:'AAA'});
  db.zip.insert({zip_id:102, zip_cd:'BBB'});
  db.zip.insert({zip_id:103, zip_cd:'CCC'});

  // Emit each customer under its zip_id.
  var mapCust = function() {
    var values = { cust_id: this.cust_id };
    emit(this.zip_id, values);
  };

  // Emit each zip code under its zip_id.
  var mapZip = function() {
    var values = { zip_cd: this.zip_cd };
    emit(this.zip_id, values);
  };

  // Merge everything emitted for one zip_id: customers are collected in
  // `cust_ids`, every other field is copied to the top level.
  //   k      - the zip_id
  //   values - customer values, zip values and documents produced by an
  //            earlier run of this function
  var reduceCustZip = function(k, values) {
    var result = {};
    values.forEach(function(value) {
      var field;
      if ("cust_id" in value) {
        if (!("cust_ids" in result)) {
          result.cust_ids = [];
        }
        result.cust_ids.push(value);
      } else {
        for (field in value) {
          if (!value.hasOwnProperty(field)) {
            continue;
          }
          if (field === "cust_ids") {
            // Already-reduced partial result: append its customers. Copying
            // the array over would discard the ones collected so far.
            if (!("cust_ids" in result)) {
              result.cust_ids = [];
            }
            result.cust_ids.push.apply(result.cust_ids, value.cust_ids);
          } else {
            result[field] = value[field];
          }
        }
      }
    });
    return result;
  };

  db.cust_zip.drop();
  db.cust.mapReduce(mapCust, reduceCustZip, {"out": {"reduce": "cust_zip"}});
  db.zip.mapReduce(mapZip, reduceCustZip, {"out": {"reduce": "cust_zip"}});
  db.cust_zip.find();

  // Emit one value per customer found under a zip, keyed by cust_id and
  // carrying that zip's id and code. Zips without customers are skipped.
  var mapCZ = function() {
    var that = this;
    if ("cust_ids" in this.value) {
      this.value.cust_ids.forEach(function(value) {
        emit(value.cust_id, {
          zip_id: that._id,
          zip_cd: that.value.zip_cd
        });
      });
    }
  };

  // Merge the values for one cust_id into a single flat object; when a field
  // appears in several values, the last one wins.
  var reduceCZ = function(k, values) {
    var result = {};
    values.forEach(function(value) {
      var field;
      for (field in value) {
        if (value.hasOwnProperty(field)) {
          result[field] = value[field];
        }
      }
    });
    return result;
  };

  db.cust_zip_joined.drop();
  db.cust_zip.mapReduce(mapCZ, reduceCZ, {"out": "cust_zip_joined"});
  db.cust_zip_joined.find().pretty();

  // Replace every document that has a `value` field with the contents of
  // that field, sending the replacements in unordered batches of 1000.
  //   dbName         - name of the database holding the collection
  //   collectionName - name of the map-reduce output collection to flatten
  var flattenMRCollection = function(dbName, collectionName) {
    var collection = db.getSiblingDB(dbName)[collectionName];
    var i = 0;
    var pending = 0; // operations queued in the current batch
    var bulk = collection.initializeUnorderedBulkOp();
    // NOTE(review): 16 looks like the shell's no-cursor-timeout query flag - confirm.
    collection.find({ value: { $exists: true } }).addOption(16).forEach(function(result) {
      print((++i));
      bulk.find({_id: result._id}).replaceOne(result.value);
      pending++;
      if (pending === 1000) {
        print("Executing bulk...");
        bulk.execute();
        bulk = collection.initializeUnorderedBulkOp();
        pending = 0;
      }
    });
    // Only flush when something is queued, so an empty final batch is never executed.
    if (pending > 0) {
      bulk.execute();
    }
  };

  flattenMRCollection("mydb","cust_zip_joined");
  db.cust_zip_joined.find().pretty();

Mongorestore具有在数据库已有内容之上追加数据的功能,所以可以利用这种行为来合并两个集合:

  1. mongodump collection1
  2. collection2.rename(collection1)
  3. mongorestore

还没有尝试,但它可能比map / reduce方法更快。

是的,你可以:试试我今天写的这个实用函数:

 // Copy every document of one or more source collections into a target
 // collection.
 //   arguments[0]    - name of the target collection
 //   arguments[1..n] - names of the source collections, copied in order
 // Documents are inserted one at a time; the sources are left untouched.
 // With only a target name given, nothing is copied.
 function shangMergeCol() {
   // Declared locally: as undeclared assignments these names leaked into the
   // shell's global scope and throw under strict mode.
   var tcol = db.getCollection(arguments[0]);
   var scol;
   for (var i = 1; i < arguments.length; i++) {
     scol = db.getCollection(arguments[i]);
     scol.find().forEach(function (d) {
       tcol.insert(d);
     });
   }
 }

你可以传递给这个函数任意数量的集合,第一个将成为目标集合。 其余的集合都是源集合,它们的数据会被复制到目标集合中。

您必须在应用程序层中执行此操作。 如果您使用的是ORM,则可以使用注释(或类似的东西)来拉取其他集合中存在的引用。 我只和Morphia一起工作过, @Reference注解在查询时会提取被引用的实体,所以我可以避免在代码中自己做这件事。