Group most recent documents by key

query

#1

Hello, i’m super recent (as of today) to CouchBase.

I have a bucket with events. Each event looks like:

{
  "createdOn": "2018-04-30T17:25:17Z",
  "eventType": "identification",
  "payload": {
    "device": "device 1",
    "identificationDistance": "0.5",
    "ingestionID": "4472375037",
    "personIdentified": false,
    "personalID": "Bastian",
    "time": "2018-04-30T17:25:17Z"
  }
}

I’m trying to build a query to filter events, by grouping by person (payload.personalID) and return the top most recent event per group.
I searched the forums and found some similar questions, from which i deduced this query:

SELECT MAX([db1.statedb.payload.time, db1])[1]
FROM statedb AS db1 
JOIN statedb AS db2 ON KEYS META(db1).id
GROUP BY db1.payload.personalID
ORDER BY db1.payload.personalID

but i have some questions:

a) is the query correct ?

b) what does MAX([db1.statedb.payload.time, db1])[1] do exactly ? Is is calculating the max between the time and the document object ?

c) To run the query i created a primary index on statedb since i was getting errors about missing indexes.
I understand why (i think) but what indexes would i need to optimise the query ? Its taking me 10.76s for 20k documents.

The indexes i have at the moment:

[
  {
    "indexes": {
      "datastore_id": "http://127.0.0.1:8091",
      "id": "46c36c45b9980d91",
      "index_key": [
        "`eventType`",
        "(`payload`.`time`)",
        "(`payload`.`personalID`)"
      ],
      "keyspace_id": "statedb",
      "name": "idxIdentifiedPersonsByTime",
      "namespace_id": "default",
      "state": "online",
      "using": "gsi"
    }
  },
  {
    "indexes": {
      "datastore_id": "http://127.0.0.1:8091",
      "id": "1522257852e3685f",
      "index_key": [
        "(distinct ((`payload`.`personalID`)))"
      ],
      "keyspace_id": "statedb",
      "name": "idxUniquePersons",
      "namespace_id": "default",
      "state": "online",
      "using": "gsi"
    }
  },
  {
    "indexes": {
      "datastore_id": "http://127.0.0.1:8091",
      "id": "14ca0ca86fc2d2e2",
      "index_key": [
        "((`db1`.`payload`).`personalID`)"
      ],
      "keyspace_id": "statedb",
      "name": "idx1",
      "namespace_id": "default",
      "state": "online",
      "using": "gsi"
    }
  },
  {
    "indexes": {
      "datastore_id": "http://127.0.0.1:8091",
      "id": "231a41e303dc1851",
      "index_key": [
        "`eventType`"
      ],
      "keyspace_id": "statedb",
      "name": "idxEventType",
      "namespace_id": "default",
      "state": "online",
      "using": "gsi"
    }
  }
]

query plan:

{
  "plan": {
    "#operator": "Sequence",
    "~children": [
      {
        "#operator": "Sequence",
        "~children": [
          {
            "#operator": "PrimaryScan",
            "index": "statedb-primary-index",
            "keyspace": "statedb",
            "namespace": "default",
            "using": "gsi"
          },
          {
            "#operator": "Fetch",
            "as": "db1",
            "keyspace": "statedb",
            "namespace": "default"
          },
          {
            "#operator": "Join",
            "as": "db2",
            "keyspace": "statedb",
            "namespace": "default",
            "on_keys": "(meta(`db1`).`id`)"
          },
          {
            "#operator": "Parallel",
            "~child": {
              "#operator": "Sequence",
              "~children": [
                {
                  "#operator": "InitialGroup",
                  "aggregates": [
                    "max([(((`db1`.`statedb`).`payload`).`time`), `db1`])"
                  ],
                  "group_keys": [
                    "((`db1`.`payload`).`personalID`)"
                  ]
                }
              ]
            }
          },
          {
            "#operator": "IntermediateGroup",
            "aggregates": [
              "max([(((`db1`.`statedb`).`payload`).`time`), `db1`])"
            ],
            "group_keys": [
              "((`db1`.`payload`).`personalID`)"
            ]
          },
          {
            "#operator": "FinalGroup",
            "aggregates": [
              "max([(((`db1`.`statedb`).`payload`).`time`), `db1`])"
            ],
            "group_keys": [
              "((`db1`.`payload`).`personalID`)"
            ]
          },
          {
            "#operator": "Parallel",
            "~child": {
              "#operator": "Sequence",
              "~children": [
                {
                  "#operator": "InitialProject",
                  "result_terms": [
                    {
                      "expr": "(max([(((`db1`.`statedb`).`payload`).`time`), `db1`])[1])"
                    }
                  ]
                }
              ]
            }
          }
        ]
      },
      {
        "#operator": "Order",
        "sort_terms": [
          {
            "expr": "((`db1`.`payload`).`personalID`)"
          }
        ]
      },
      {
        "#operator": "FinalProject"
      }
    ]
  },
  "text": "SELECT MAX([db1.statedb.payload.time, db1])[1]\nFROM statedb AS db1 \nJOIN statedb AS db2 ON KEYS META(db1).id\nGROUP BY db1.payload.personalID\nORDER BY db1.payload.personalID"
}

Thanks!


Select non-group by fields using sql query
#2

As you never used db2 , you no need join. Also if you want only eventType = “identification”, the following is right query and index

SELECT MAX([db1.payload.time, db1])[1] AS d
FROM statedb AS db1 
WHERE db1.eventType = "identification"
GROUP BY db1.payload.personalID
ORDER BY d.payload.personalID;
CREATE INDEX ix1 ON statedb(eventType);

MAX([db1.statedb.payload.time, db1]) ==> If you look argument of MAX it is array with 0th position is db1.statedb.payload.time, 1st position is whole document.
When MAX is calculated it calculates on ARRAY, When 0th element MAX , when 0th element is equal then 1st element like that.
As MAX produces ARRAY then [1] gives 1st element which is whole document.
This is approach is hidden advantage in N1QL for both MAX, MIN

Also check this post further optimize first do get META().id for groups and then fetch whole document

The following will perform better

CREATE INDEX ix1 ON statedb( payload.time, payload.personalID ) WHERE eventType = "identification";
SELECT d.* FROM (SELECT  RAW (MAX([payload.time,META().id])[1])
                  FROM stagedb
                  WHERE eventType = "identification" AND payload.time IS NOT NULL
                  GROUP BY payload.personalID) AS q
JOIN stagedb AS d ON KEYS q
ORDER BY d.payload.personalID;

If you want on specific times you can change the predicate payload.time IS NOT NULL.
NOTE: You need to have predicate on leading index key to choose the index.