Commit

added tokenSplitRegex option

fergiemcdowall committed Apr 23, 2022
1 parent b6d85be commit 65ad3dc
Showing 11 changed files with 111 additions and 36 deletions.
4 changes: 2 additions & 2 deletions dist/search-index-3.0.3.js → dist/search-index-3.1.0.js
Large diffs are not rendered by default.

File renamed without changes.
Large diffs are not rendered by default.

File renamed without changes.

4 changes: 2 additions & 2 deletions dist/search-index.js
Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion docs/API.md

@@ -354,7 +354,7 @@
 If any document does not contain an `_id` field, then one will be
 generated and assigned

-`options` is an optional object that can contain the following values:
+`options` is an optional object that can contain the following values. These values can also be set when initialising the index rather than in every `PUT`:

 | Name | Type | Default | Description |
 |---|---|---|---|
@@ -366,6 +366,7 @@ generated and assigned
 |`storeRawDocs`|`boolean`|`true`|Whether to store the raw document or not. In many cases it may be desirable to store it externally, or to skip storing when indexing if it is going to be updated directly later on|
 |`storeVectors`|`boolean`|`false`|When `true`, documents will be deletable and overwritable, but will take up more space on disk|
 |`tokenizationPipeline`|`Array`|<pre lang="javascript">[<br>  SPLIT,<br>  SKIP,<br>  LOWCASE,<br>  REPLACE,<br>  NGRAMS,<br>  STOPWORDS,<br>  SCORE_TERM_FREQUENCY<br>]</pre>|Tokenisation pipeline. Stages can be added and reordered|
+|`tokenSplitRegex`|`RegExp`|`/[\p{L}\d]+/gu`|The regular expression that splits strings into tokens|
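
A minimal usage sketch for the new option (the index name and document below are illustrative; the `si`/`PUT`/`SEARCH` calls mirror the tests added in this commit):

```javascript
const si = require('search-index')

const main = async () => {
  const { PUT, SEARCH } = await si({
    name: 'my-index', // illustrative index name
    // widen the default /[\p{L}\d]+/gu so that a bare '*' survives tokenisation
    tokenSplitRegex: /[\p{L}\d*]+/gu
  })
  await PUT([{ _id: 1, bandName: '*' }])
  // '*' is now indexed as a token, so it is searchable
  console.log(await SEARCH(['*']))
}

main()
```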

### Tokenization pipeline when indexing

2 changes: 1 addition & 1 deletion package.json

@@ -1,6 +1,6 @@
 {
   "name": "search-index",
-  "version": "3.0.3",
+  "version": "3.1.0",
   "description": "A network resilient, persistent full-text search library for the browser and Node.js",
   "keywords": [
     "search",
7 changes: 3 additions & 4 deletions src/main.js

@@ -86,12 +86,11 @@ const initIndex = (ops = {}) =>
       ngrams: {},
       replace: {},
       storeRawDocs: true,
-      // TODO: processDocuments probably shouldn't be an option?
-
-      tokenizer: tp.tokenizer,
       stopwords: [],
       storeVectors: true, // TODO: make a test for this being false
-      tokenAppend: '#'
+      tokenAppend: '#',
+      tokenSplitRegex: /[\p{L}\d]+/gu,
+      tokenizer: tp.tokenizer
     },
     ops
   )
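
For orientation, a hypothetical reduction of the merge these defaults imply (assuming a plain `Object.assign`-style merge; the library's actual `initIndex` may differ in detail), showing that an option passed at init wins over its default:

```javascript
// Sketch of the defaults/ops merge suggested by the diff above
const defaults = {
  tokenAppend: '#',
  tokenSplitRegex: /[\p{L}\d]+/gu
}
const ops = { tokenSplitRegex: /[\p{L}\d*]+/gu } // user-supplied at init

const merged = Object.assign({}, defaults, ops)
console.log(merged.tokenSplitRegex) // /[\p{L}\d*]+/gu (user value wins)
console.log(merged.tokenAppend) // '#' (default retained)
```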
2 changes: 1 addition & 1 deletion src/tokenisationPipeline.js

@@ -2,7 +2,7 @@ const tv = require('term-vector')
 const ngraminator = require('ngraminator')

 const SPLIT = ([tokens, field, ops]) =>
-  Promise.resolve([tokens.match(/[\p{L}\d]+/gu) || [], field, ops])
+  Promise.resolve([tokens.match(ops.tokenSplitRegex) || [], field, ops])

 const SKIP = ([tokens, field, ops]) => [
   ops.skipFields.includes(field) ? [] : tokens,
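
The behavioural effect of this one-line change can be demonstrated standalone: `String.prototype.match` with the configured regex decides what counts as a token:

```javascript
// Default regex: only letters (any script) and digits form tokens
const defaultRegex = /[\p{L}\d]+/gu
console.log('The Rolling Stones'.match(defaultRegex) || [])
// [ 'The', 'Rolling', 'Stones' ]
console.log('*'.match(defaultRegex) || [])
// [] (a bare asterisk produces no tokens)

// Custom regex, as in the tests added below: '*' is now token material
const customRegex = /[\p{L}\d*]+/gu
console.log('*'.match(customRegex) || [])
// [ '*' ]
```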
23 changes: 0 additions & 23 deletions test/src/issue-571-test.js

@@ -56,27 +56,4 @@ test('set up as per issue #571', async function (t) {
       RESULT_LENGTH: 1
     }
   )
-
-  // t.ok(await FLUSH())
-
-  // t.deepEquals(await SEARCH(['q']), {
-  //   RESULT: [],
-  //   RESULT_LENGTH: 0
-  // })
-
-  // t.deepEquals(await PUT(docs), [
-  //   { _id: 'qwertyu', status: 'CREATED', operation: 'PUT' },
-  //   { _id: 'asdfgh', status: 'CREATED', operation: 'PUT' }
-  // ])
-
-  // t.deepEquals(await SEARCH(['q']), {
-  //   RESULT: [
-  //     {
-  //       _id: 'qwertyu',
-  //       _match: [{ FIELD: 'idx', VALUE: 'q', SCORE: '1.00' }],
-  //       _score: 1.1
-  //     }
-  //   ],
-  //   RESULT_LENGTH: 1
-  // })
 })
98 changes: 98 additions & 0 deletions test/src/issue-577-test.js

@@ -0,0 +1,98 @@
const si = require('../../')
const test = require('tape')

const sandbox = 'test/sandbox/'

const docs = [
  {
    _id: 1,
    bandName: '*',
    description: 'The nice boys of pop'
  },
  {
    _id: 'two',
    bandName: 'The Rolling Stones',
    description: 'The bad boys of rock'
  },
  {
    _id: 3,
    bandName: 'The Who',
    description: 'Nearly as good as Led Zeppelin'
  }
]

test('set up as per issue #577', async function (t) {
  t.plan(2)

  const { PUT, SEARCH } = await si({
    name: sandbox + '577',
    tokenSplitRegex: /[\p{L}\d*]+/gu
  })

  t.deepEquals(await PUT(docs), [
    { _id: 1, operation: 'PUT', status: 'CREATED' },
    { _id: 'two', operation: 'PUT', status: 'CREATED' },
    { _id: 3, operation: 'PUT', status: 'CREATED' }
  ])

  t.deepEquals(
    await SEARCH(['*'], {
      DOCUMENTS: true
    }),
    {
      RESULT: [
        {
          _id: 1,
          _match: [{ FIELD: 'bandname', VALUE: '*', SCORE: '1.00' }],
          _score: 1.39,
          _doc: {
            _id: 1,
            bandName: '*',
            description: 'The nice boys of pop'
          }
        }
      ],
      RESULT_LENGTH: 1
    }
  )
})

test('set up as per issue #577', async function (t) {
  t.plan(2)

  const { PUT, SEARCH } = await si({
    name: sandbox + '577-2'
  })

  t.deepEquals(
    await PUT(docs, {
      tokenSplitRegex: /[\p{L}\d*]+/gu
    }),
    [
      { _id: 1, operation: 'PUT', status: 'CREATED' },
      { _id: 'two', operation: 'PUT', status: 'CREATED' },
      { _id: 3, operation: 'PUT', status: 'CREATED' }
    ]
  )

  t.deepEquals(
    await SEARCH(['*'], {
      DOCUMENTS: true
    }),
    {
      RESULT: [
        {
          _id: 1,
          _match: [{ FIELD: 'bandname', VALUE: '*', SCORE: '1.00' }],
          _score: 1.39,
          _doc: {
            _id: 1,
            bandName: '*',
            description: 'The nice boys of pop'
          }
        }
      ],
      RESULT_LENGTH: 1
    }
  )
})
