N1QL vs MapReduce Views with Sync-Gateway - Performance issues


#1

Using Couchbase EE 4.5 with the SGW 1.2.1

Testing views vs N1QL, I see performance issue with N1QL, and need help.
The basic view is getting all docs for a certain type.

function (doc, meta) {
  if (meta.id && (meta.id.indexOf('_sync') === -1) && !doc._deleted) {
    if (doc.type) {
      emit( doc.type );
    }
  }
}

N1QL - SELECT * FROM test_sw1 where meta().id not like ‘_sync:%’ and type = ‘user’ and (_deleted = false or _deleted IS MISSING)

Both return the correct amount of rows ~ 4K
Views - 1.115846118s
N1QL - 6.539213193s.

Indexes
CREATE INDEX type_not_deleted_ix ON test_sw1(type,_deleted,(meta().id))
CREATE INDEX over5 ON
test_sw1(type,(meta().id),_deleted) WHERE ((not
((meta().id) like “_sync:%”)) and ((_deleted = false) or (_deleted
is missing)))


#2

Can you post the EXPLAIN plan?


#3
[
  {
    "plan": {
      "#operator": "Sequence",
      "~children": [
        {
          "#operator": "IntersectScan",
          "scans": [
            {
              "#operator": "IndexScan",
              "index": "type_ix",
              "index_id": "71e5ae4863f63694",
              "keyspace": "test_sw1",
              "namespace": "default",
              "spans": [
                {
                  "Range": {
                    "High": [
                      "\"user\""
                    ],
                    "Inclusion": 3,
                    "Low": [
                      "\"user\""
                    ]
                  }
                }
              ],
              "using": "gsi"
            },
            {
              "#operator": "IndexScan",
              "index": "over5",
              "index_id": "6b1a913415806ec5",
              "keyspace": "test_sw1",
              "namespace": "default",
              "spans": [
                {
                  "Range": {
                    "High": [
                      "successor(\"user\")"
                    ],
                    "Inclusion": 1,
                    "Low": [
                      "\"user\""
                    ]
                  }
                }
              ],
              "using": "gsi"
            }
          ]
        },
        {
          "#operator": "Parallel",
          "~child": {
            "#operator": "Sequence",
            "~children": [
              {
                "#operator": "Fetch",
                "keyspace": "test_sw1",
                "namespace": "default"
              },
              {
                "#operator": "Filter",
                "condition": "(((not ((meta(`test_sw1`).`id`) like \"_sync:%\")) and ((`test_sw1`.`type`) = \"user\")) and (((`test_sw1`.`_deleted`) = false) or ((`test_sw1`.`_deleted`) is missing)))"
              },
              {
                "#operator": "InitialProject",
                "result_terms": [
                  {
                    "expr": "self",
                    "star": true
                  }
                ]
              },
              {
                "#operator": "FinalProject"
              }
            ]
          }
        }
      ]
    },
    "text": "SELECT * FROM `test_sw1`  where  meta().id not like '_sync:%' and type = 'user' and (_deleted = false or _deleted IS MISSING)"
  }
]

#4

Try using an index hint. You need this to avoid the index type_ix and use only index over5.

EXPLAIN SELECT *
FROM test_sw1 USE INDEX (over5)
WHERE meta().id NOT LIKE 'sync:%' AND type = 'user' AND (_deleted = FALSE OR _deleted IS MISSING)
;

#5
[
  {
    "plan": {
      "#operator": "Sequence",
      "~children": [
        {
          "#operator": "IndexScan",
          "index": "over5",
          "index_id": "6b1a913415806ec5",
          "keyspace": "test_sw1",
          "namespace": "default",
          "spans": [
            {
              "Range": {
                "High": [
                  "successor(\"user\")"
                ],
                "Inclusion": 1,
                "Low": [
                  "\"user\""
                ]
              }
            }
          ],
          "using": "gsi"
        },
        {
          "#operator": "Parallel",
          "~child": {
            "#operator": "Sequence",
            "~children": [
              {
                "#operator": "Fetch",
                "keyspace": "test_sw1",
                "namespace": "default"
              },
              {
                "#operator": "Filter",
                "condition": "(((not ((meta(`test_sw1`).`id`) like \"_sync:%\")) and ((`test_sw1`.`type`) = \"user\")) and (((`test_sw1`.`_deleted`) = false) or ((`test_sw1`.`_deleted`) is missing)))"
              },
              {
                "#operator": "InitialProject",
                "result_terms": [
                  {
                    "expr": "self",
                    "star": true
                  }
                ]
              },
              {
                "#operator": "FinalProject"
              }
            ]
          }
        }
      ]
    },
    "text": "SELECT * FROM `test_sw1` USE INDEX (over5 using GSI)  where  meta().id not like '_sync:%' and type = 'user' and (_deleted = false or _deleted IS MISSING)"
  }
]

#6

How long does the query take now, using this new query with the hint?


#7

View - 1.2 s
N1QL - 5.275459658 s

Is there a better way to remove all the deleted docs? Maybe create a new index?


#8

The view is only emitting doc.type, while N1QL is selecting everything.

Are there specific fields you want to return? If there are specific fields you want to return, you can use a covering index for better performance.


#10

ok. It it best to get all the IDs then do a bulk_fetch?


#11

Should not be necessary, but we have seen that sometimes. Can you try the covering index?


#12

Updating the Query to Select Id From…

This it the correct way to create a covering index?
Definition: CREATE INDEX over5_2 ON test_sw1(id) WHERE ((not ((meta().id) like “_sync:%”)) and ((_deleted = false) or (_deleted is missing)))

[
  {
    "plan": {
      "#operator": "Sequence",
      "~children": [
        {
          "#operator": "PrimaryScan",
          "index": "#primary",
          "keyspace": "test__sw1",
          "namespace": "default",
          "using": "gsi"
        },
        {
          "#operator": "Parallel",
          "~child": {
            "#operator": "Sequence",
            "~children": [
              {
                "#operator": "Fetch",
                "keyspace": "test__sw1",
                "namespace": "default"
              },
              {
                "#operator": "Filter",
                "condition": "(((not ((meta(`test__sw1`).`id`) like \"_sync:%\")) and ((`test__sw1`.`type`) = \"user\")) and (((`test__sw1`.`_deleted`) = false) or ((`test__sw1`.`_deleted`) is missing)))"
              },
              {
                "#operator": "InitialProject",
                "result_terms": [
                  {
                    "expr": "(`test__sw1`.`id`)"
                  }
                ]
              },
              {
                "#operator": "FinalProject"
              }
            ]
          }
        }
      ]
    },
    "text": "SELECT id FROM `test__sw1` USE INDEX(over5_2) where  meta().id not like '_sync:%' and type = 'user' and (_deleted = false or _deleted IS MISSING)"
  }
]

#13

It is SELECT META().id, not SELECT id.

A covering index means the index includes all the data in the query. So you can select other fields, if you add them to end of the index.

Can you post the EXPLAIN output and execution time for this query.

EXPLAIN SELECT META().id
FROM test_sw1 USE INDEX (over5)
WHERE meta().id NOT LIKE 'sync:%' AND type = 'user' AND (_deleted = FALSE OR _deleted IS MISSING)
;

#14
[
  {
    "plan": {
      "#operator": "Sequence",
      "~children": [
        {
          "#operator": "PrimaryScan",
          "index": "#primary",
          "keyspace": "test_sw1",
          "namespace": "default",
          "using": "gsi"
        },
        {
          "#operator": "Parallel",
          "~child": {
            "#operator": "Sequence",
            "~children": [
              {
                "#operator": "Fetch",
                "keyspace": "test_sw1",
                "namespace": "default"
              },
              {
                "#operator": "Filter",
                "condition": "(((not ((meta(`test_sw1`).`id`) like \"sync:%\")) and ((`test_sw1`.`type`) = \"user\")) and (((`test_sw1`.`_deleted`) = false) or ((`test_sw1`.`_deleted`) is missing)))"
              },
              {
                "#operator": "InitialProject",
                "result_terms": [
                  {
                    "expr": "(meta(`test_sw1`).`id`)"
                  }
                ]
              },
              {
                "#operator": "FinalProject"
              }
            ]
          }
        }
      ]
    },
    "text": "SELECT META().id\nFROM test_sw1 USE INDEX (over5)\nWHERE meta().id NOT LIKE 'sync:%' AND type = 'user' AND (_deleted = FALSE OR _deleted IS MISSING)\n;"
  }
]

#15

You changed

NOT LIKE ‘_sync’

to

NOT LIKE ‘sync’.

You need to be consistent between the index and the query. You need to see IndexScan and cover() in the EXPLAIN output. And then you need to post the execution time from actually running the query…


#17

That was copied from the prev Post

Here is the output and I see the cover now

[
  {
    "plan": {
      "#operator": "Sequence",
      "~children": [
        {
          "#operator": "IndexScan",
          "covers": [
            "cover ((`test_sw1`.`type`))",
            "cover ((meta(`test_sw1`).`id`))",
            "cover ((`test_sw1`.`_deleted`))",
            "cover ((meta(`test_sw1`).`id`))"
          ],
          "filter_covers": {
            "cover ((not ((meta(`test_sw1`).`id`) like \"_sync:%\")))": true
          },
          "index": "over5",
          "index_id": "62110fa20a745e1d",
          "keyspace": "test_sw1",
          "namespace": "default",
          "spans": [
            {
              "Range": {
                "High": [
                  "successor(\"user\")"
                ],
                "Inclusion": 1,
                "Low": [
                  "\"user\""
                ]
              }
            }
          ],
          "using": "gsi"
        },
        {
          "#operator": "Parallel",
          "~child": {
            "#operator": "Sequence",
            "~children": [
              {
                "#operator": "Filter",
                "condition": "((cover ((not ((meta(`test_sw1`).`id`) like \"_sync:%\"))) and (cover ((`test_sw1`.`type`)) = \"user\")) and ((cover ((`test_sw1`.`_deleted`)) = false) or (cover ((`test_sw1`.`_deleted`)) is missing)))"
              },
              {
                "#operator": "InitialProject",
                "result_terms": [
                  {
                    "expr": "cover ((meta(`test_sw1`).`id`))"
                  }
                ]
              },
              {
                "#operator": "FinalProject"
              }
            ]
          }
        }
      ]
    },
    "text": "SELECT META().id\nFROM test_sw1 USE INDEX (over5)\nWHERE meta().id NOT LIKE '_sync:%' AND type = 'user' AND (_deleted = FALSE OR _deleted IS MISSING)\n;"
  }
]

#18

Ok. Now you need to remove the word “EXPLAIN” and run the query, and then post the metrics here. Please avoid copy-paste errors and use that latest query.


#19

Thanks.

View - 1.91s
N1QL - 1.04s

I will be testing more complex views and N1QL.


#20

Another question on Joins on how to better improve this query? What kind of Indexing should be needed?

Some notes:

  1. usr.locations is a hash and we need to see if the key exists, this can be different
  2. pa.residentId == usr.id
  3. I get the same results as my view.

Query
EXPLAIN Select usr.id as userId, pa.id as paId
FROM sync_gateway_sw1 pa INNER JOIN sync_gateway_sw1 usr ON KEYS pa.residentId
WHERE meta().id NOT LIKE ‘_sync:%’ AND (meta()._deleted = FALSE OR meta()._deleted IS MISSING) AND usr.type = ‘user’ AND
usr.resident = true
and usr.locations.a3a2921e-96d9-45c9-aeb2-cc83410ad9c7 IS NOT NULL
and pa.type = ‘place-association’ and pa.locationId = ‘a3a2921e-96d9-45c9-aeb2-cc83410ad9c7’

[
  {
    "plan": {
      "#operator": "Sequence",
      "~children": [
        {
          "#operator": "IndexScan",
          "index": "type_ix",
          "index_id": "71e5ae4863f63694",
          "keyspace": "sync_gateway_sw1",
          "namespace": "default",
          "spans": [
            {
              "Range": {
                "High": [
                  "\"place-association\""
                ],
                "Inclusion": 3,
                "Low": [
                  "\"place-association\""
                ]
              }
            }
          ],
          "using": "gsi"
        },
        {
          "#operator": "Parallel",
          "~child": {
            "#operator": "Sequence",
            "~children": [
              {
                "#operator": "Fetch",
                "as": "pa",
                "keyspace": "sync_gateway_sw1",
                "namespace": "default"
              },
              {
                "#operator": "Join",
                "as": "usr",
                "keyspace": "sync_gateway_sw1",
                "namespace": "default",
                "on_keys": "(`pa`.`residentId`)"
              },
              {
                "#operator": "Filter",
                "condition": "(((((((not ((meta().`id`) like \"_sync:%\")) and (((meta().`_deleted`) = false) or ((meta().`_deleted`) is missing))) and ((`usr`.`type`) = \"user\")) and ((`usr`.`resident`) = true)) and (((`usr`.`locations`).`a3a2921e-96d9-45c9-aeb2-cc83410ad9c7`) is not null)) and ((`pa`.`type`) = \"place-association\")) and ((`pa`.`locationId`) = \"a3a2921e-96d9-45c9-aeb2-cc83410ad9c7\"))"
              },
              {
                "#operator": "InitialProject",
                "result_terms": [
                  {
                    "as": "userId",
                    "expr": "(`usr`.`id`)"
                  },
                  {
                    "as": "paId",
                    "expr": "(`pa`.`id`)"
                  }
                ]
              },
              {
                "#operator": "FinalProject"
              }
            ]
          }
        }
      ]
    },
    "text": "Select usr.id as userId, pa.id as paId\nFROM sync_gateway_sw1 pa INNER JOIN sync_gateway_sw1 usr ON KEYS pa.residentId\nWHERE meta().id NOT LIKE '_sync:%' AND (meta()._deleted = FALSE OR meta()._deleted IS MISSING)  AND usr.type = 'user' AND\nusr.resident = true\nand usr.locations.`a3a2921e-96d9-45c9-aeb2-cc83410ad9c7` IS NOT NULL\nand pa.type = 'place-association' and pa.locationId = 'a3a2921e-96d9-45c9-aeb2-cc83410ad9c7'"
  }
]

#21

I would suggest two changes:

(1) Create an index on pa ( locationId, type, id, residentId ). You should see cover() in your EXPLAIN.

(2) Change all occurrences of META() to META(pa).


#22

I am having trouble creating the index for pa

I have tried

CREATE INDEX pa2_idx ON pa:sync_gateway_sw1(locations, type, id, residentId)
[
  {
    "code": 12003,
    "msg": "Keyspace not found keyspace sync_gateway_sw1 - cause: No bucket named sync_gateway_sw1",
    "query_from_user": "CREATE INDEX pa2_idx ON pa:sync_gateway_sw1(locations, type, id, residentId)"
  }
]

AND

CREATE INDEX pa2_idx ON sync_gateway_sw1:pa(locations, type, id, residentId)