From bb464f15a8b6d7d27184354621086eb18b6810f9 Mon Sep 17 00:00:00 2001 From: tesshucom Date: Sun, 15 Sep 2019 22:09:21 +0900 Subject: [PATCH 1/2] =?UTF-8?q?Apply=20boost=20values=20=E2=80=8B=E2=80=8B?= =?UTF-8?q?to=20search=20queries?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../airsonic/player/service/search/IndexType.java | 12 +++++------- .../airsonic/player/service/search/QueryFactory.java | 12 +++++++++--- .../player/service/search/QueryFactoryTestCase.java | 12 ++++++------ 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/airsonic-main/src/main/java/org/airsonic/player/service/search/IndexType.java b/airsonic-main/src/main/java/org/airsonic/player/service/search/IndexType.java index 2edcf28e..a433a5ec 100644 --- a/airsonic-main/src/main/java/org/airsonic/player/service/search/IndexType.java +++ b/airsonic-main/src/main/java/org/airsonic/player/service/search/IndexType.java @@ -39,7 +39,7 @@ public enum IndexType { FieldNames.TITLE, FieldNames.ARTIST), boosts( - entry(FieldNames.TITLE, 2F))), + entry(FieldNames.TITLE, 1.1F))), ALBUM( fieldNames( @@ -47,7 +47,7 @@ public enum IndexType { FieldNames.ARTIST), // FieldNames.FOLDER), // XXX 3.x -> 8.x : Remove folder from multi-field search condition boosts( - entry(FieldNames.ALBUM, 2F))), + entry(FieldNames.ALBUM, 1.1F))), ALBUM_ID3( fieldNames( @@ -55,20 +55,18 @@ public enum IndexType { FieldNames.ARTIST), // FieldNames.FOLDER_ID), // XXX 3.x -> 8.x : Remove folder from multi-field search condition boosts( - entry(FieldNames.ALBUM, 2F))), + entry(FieldNames.ALBUM, 1.1F))), ARTIST( fieldNames( FieldNames.ARTIST), // FieldNames.FOLDER), // XXX 3.x -> 8.x : Remove folder from multi-field search condition - boosts( - entry(FieldNames.ARTIST, 1F))), + boosts()), ARTIST_ID3( fieldNames( FieldNames.ARTIST), - boosts( - entry(FieldNames.ARTIST, 2F))), + boosts()), ; diff --git 
a/airsonic-main/src/main/java/org/airsonic/player/service/search/QueryFactory.java b/airsonic-main/src/main/java/org/airsonic/player/service/search/QueryFactory.java index b5155868..74236762 100644 --- a/airsonic-main/src/main/java/org/airsonic/player/service/search/QueryFactory.java +++ b/airsonic-main/src/main/java/org/airsonic/player/service/search/QueryFactory.java @@ -31,6 +31,7 @@ import org.apache.lucene.document.IntPoint; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.WildcardQuery; @@ -101,7 +102,8 @@ public class QueryFactory { * - Self made parser process reduces one library dependency. * - It is easy to make corrections later when changing the query to improve search accuracy. */ - private Query createMultiFieldWildQuery(@NonNull String[] fieldNames, @NonNull String queryString) + private Query createMultiFieldWildQuery(@NonNull String[] fieldNames, @NonNull String queryString, + @NonNull IndexType indexType) throws IOException { BooleanQuery.Builder mainQuery = new BooleanQuery.Builder(); @@ -117,7 +119,11 @@ public class QueryFactory { while (stream.incrementToken()) { String token = stream.getAttribute(CharTermAttribute.class).toString(); WildcardQuery wildcardQuery = new WildcardQuery(new Term(fieldName, token.concat(ASTERISK))); - fieldQuerys.add(wildcardQuery); + if (indexType.getBoosts().containsKey(fieldName)) { + fieldQuerys.add(new BoostQuery(wildcardQuery, indexType.getBoosts().get(fieldName))); + } else { + fieldQuerys.add(wildcardQuery); + } } fieldsQuerys.add(fieldQuerys); } @@ -169,7 +175,7 @@ public class QueryFactory { BooleanQuery.Builder mainQuery = new BooleanQuery.Builder(); - Query multiFieldQuery = createMultiFieldWildQuery(indexType.getFields(), criteria.getQuery()); + Query 
multiFieldQuery = createMultiFieldWildQuery(indexType.getFields(), criteria.getQuery(), indexType); mainQuery.add(multiFieldQuery, Occur.MUST); boolean isId3 = indexType == IndexType.ALBUM_ID3 || indexType == IndexType.ARTIST_ID3; diff --git a/airsonic-main/src/test/java/org/airsonic/player/service/search/QueryFactoryTestCase.java b/airsonic-main/src/test/java/org/airsonic/player/service/search/QueryFactoryTestCase.java index dda0d023..42107686 100644 --- a/airsonic-main/src/test/java/org/airsonic/player/service/search/QueryFactoryTestCase.java +++ b/airsonic-main/src/test/java/org/airsonic/player/service/search/QueryFactoryTestCase.java @@ -123,13 +123,13 @@ public class QueryFactoryTestCase { Query query = queryFactory.search(criteria, SINGLE_FOLDERS, IndexType.ALBUM); assertEquals("SearchAlbum", - "+((album:abc* artist:abc*) (album:def* artist:def*)) +(folder:" + PATH1 + "+(((album:abc*)^1.1 artist:abc*) ((album:def*)^1.1 artist:def*)) +(folder:" + PATH1 + ")", query.toString()); query = queryFactory.search(criteria, MULTI_FOLDERS, IndexType.ALBUM); assertEquals("SearchAlbum", - "+((album:abc* artist:abc*) (album:def* artist:def*)) +(folder:" + PATH1 + "+(((album:abc*)^1.1 artist:abc*) ((album:def*)^1.1 artist:def*)) +(folder:" + PATH1 + " folder:" + PATH2 + ")", query.toString()); } @@ -143,11 +143,11 @@ public class QueryFactoryTestCase { Query query = queryFactory.search(criteria, SINGLE_FOLDERS, IndexType.SONG); assertEquals("SearchSong", - "+((title:abc* artist:abc*) (title:def* artist:def*)) +(folder:" + PATH1 + ")", + "+(((title:abc*)^1.1 artist:abc*) ((title:def*)^1.1 artist:def*)) +(folder:" + PATH1 + ")", query.toString()); query = queryFactory.search(criteria, MULTI_FOLDERS, IndexType.SONG); - assertEquals("SearchSong", "+((title:abc* artist:abc*) (title:def* artist:def*)) +(folder:" + PATH1 + assertEquals("SearchSong", "+(((title:abc*)^1.1 artist:abc*) ((title:def*)^1.1 artist:def*)) +(folder:" + PATH1 + " folder:" + PATH2 + ")", query.toString()); } 
@@ -178,13 +178,13 @@ public class QueryFactoryTestCase { Query query = queryFactory.search(criteria, SINGLE_FOLDERS, IndexType.ALBUM_ID3); assertEquals( - "SearchAlbumId3", "+((album:abc* artist:abc*) (album:def* artist:def*)) " + "SearchAlbumId3", "+(((album:abc*)^1.1 artist:abc*) ((album:def*)^1.1 artist:def*)) " + "+(folderId:" + FID1 + ")", query.toString()); query = queryFactory.search(criteria, MULTI_FOLDERS, IndexType.ALBUM_ID3); assertEquals("SearchAlbumId3", - "+((album:abc* artist:abc*) (album:def* artist:def*)) +(folderId:" + "+(((album:abc*)^1.1 artist:abc*) ((album:def*)^1.1 artist:def*)) +(folderId:" + FID1 + " folderId:" + FID2 + ")", query.toString()); From dba8610ff7f15331b922f4a0077a42e8b9c543b5 Mon Sep 17 00:00:00 2001 From: tesshucom Date: Sun, 15 Sep 2019 22:13:15 +0900 Subject: [PATCH 2/2] Apply stopwords dedicated to music search - Iterate index version. --- .../service/search/AnalyzerFactory.java | 30 ++++++- .../player/service/search/IndexManager.java | 2 +- .../service/search/analysis/stopwords.txt | 37 ++++++++ .../search/analysis/stopwords_artist.txt | 45 ++++++++++ .../search/AnalyzerFactoryTestCase.java | 89 ++++++++++++------- ...archServiceStartWithStopwardsTestCase.java | 4 +- 6 files changed, 170 insertions(+), 37 deletions(-) create mode 100644 airsonic-main/src/main/resources/org/airsonic/player/service/search/analysis/stopwords.txt create mode 100644 airsonic-main/src/main/resources/org/airsonic/player/service/search/analysis/stopwords_artist.txt diff --git a/airsonic-main/src/main/java/org/airsonic/player/service/search/AnalyzerFactory.java b/airsonic-main/src/main/java/org/airsonic/player/service/search/AnalyzerFactory.java index eeae8b9a..afee1aa1 100644 --- a/airsonic-main/src/main/java/org/airsonic/player/service/search/AnalyzerFactory.java +++ b/airsonic-main/src/main/java/org/airsonic/player/service/search/AnalyzerFactory.java @@ -51,6 +51,10 @@ import static org.springframework.util.ObjectUtils.isEmpty; @Component 
public final class AnalyzerFactory { + private static final String STOP_WORDS = "org/airsonic/player/service/search/analysis/stopwords.txt"; + + private static final String STOP_WORDS_ARTIST = "org/airsonic/player/service/search/analysis/stopwords_artist.txt"; + private Analyzer analyzer; private Analyzer queryAnalyzer; @@ -105,7 +109,19 @@ public final class AnalyzerFactory { .addTokenFilter(CJKWidthFilterFactory.class) .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "false") .addTokenFilter(LowerCaseFilterFactory.class) - .addTokenFilter(StopFilterFactory.class) + .addTokenFilter(StopFilterFactory.class, "words", STOP_WORDS) + .addTokenFilter(EnglishPossessiveFilterFactory.class); + addTokenFilterForUnderscoreRemovalAroundToken(builder); + return builder; + } + + private Builder createArtistAnalyzerBuilder() throws IOException { + Builder builder = CustomAnalyzer.builder() + .withTokenizer(StandardTokenizerFactory.class) + .addTokenFilter(CJKWidthFilterFactory.class) + .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "false") + .addTokenFilter(LowerCaseFilterFactory.class) + .addTokenFilter(StopFilterFactory.class, "words", STOP_WORDS_ARTIST) .addTokenFilter(EnglishPossessiveFilterFactory.class); addTokenFilterForUnderscoreRemovalAroundToken(builder); return builder; @@ -134,7 +150,15 @@ public final class AnalyzerFactory { public Analyzer getAnalyzer() throws IOException { if (isEmpty(analyzer)) { try { - analyzer = createDefaultAnalyzerBuilder().build(); + + Analyzer defaultAnalyzer = createDefaultAnalyzerBuilder().build(); + Analyzer artistAnalyzer = createArtistAnalyzerBuilder().build(); + + Map fieldAnalyzers = new HashMap<>(); + fieldAnalyzers.put(FieldNames.ARTIST, artistAnalyzer); + + analyzer = new PerFieldAnalyzerWrapper(defaultAnalyzer, fieldAnalyzers); + } catch (IOException e) { throw new IOException("Error when initializing Analyzer.", e); } @@ -161,9 +185,11 @@ public final class AnalyzerFactory { try { 
Analyzer defaultAnalyzer = createDefaultAnalyzerBuilder().build(); + Analyzer artistAnalyzer = createArtistAnalyzerBuilder().build(); Analyzer genreAnalyzer = createGenreAnalyzerBuilder().build(); Map fieldAnalyzers = new HashMap<>(); + fieldAnalyzers.put(FieldNames.ARTIST, artistAnalyzer); fieldAnalyzers.put(FieldNames.GENRE, genreAnalyzer); queryAnalyzer = new PerFieldAnalyzerWrapper(defaultAnalyzer, fieldAnalyzers); diff --git a/airsonic-main/src/main/java/org/airsonic/player/service/search/IndexManager.java b/airsonic-main/src/main/java/org/airsonic/player/service/search/IndexManager.java index dd859287..c2562fc2 100644 --- a/airsonic-main/src/main/java/org/airsonic/player/service/search/IndexManager.java +++ b/airsonic-main/src/main/java/org/airsonic/player/service/search/IndexManager.java @@ -74,7 +74,7 @@ public class IndexManager { * DocumentFactory or the class that they use. * */ - private static final int INDEX_VERSION = 16; + private static final int INDEX_VERSION = 17; /** * Literal name of index top directory. diff --git a/airsonic-main/src/main/resources/org/airsonic/player/service/search/analysis/stopwords.txt b/airsonic-main/src/main/resources/org/airsonic/player/service/search/analysis/stopwords.txt new file mode 100644 index 00000000..c080f0a0 --- /dev/null +++ b/airsonic-main/src/main/resources/org/airsonic/player/service/search/analysis/stopwords.txt @@ -0,0 +1,37 @@ +# This file is part of Airsonic. +# +# Airsonic is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Airsonic is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with Airsonic. If not, see <http://www.gnu.org/licenses/>. +# Copyright 2016 (C) Airsonic Authors + +# This file defines stop words suitable for music search. +# See EnglishAnalyzer.ENGLISH_STOP_WORDS_SET for default stopwords. +# +# a, an, and, are, as, at, +# be, but, by, for, if, in, +# into, is, it, no, not, of, +# on, or, such, that, the, +# their, then, there, these, +# they, this, to, was, will, +# with + +# Ignore articles that are used by default in the index. +# See SettingsService.DEFAULT_IGNORED_ARTICLES + +a +an +the +el +las +le +les \ No newline at end of file diff --git a/airsonic-main/src/main/resources/org/airsonic/player/service/search/analysis/stopwords_artist.txt b/airsonic-main/src/main/resources/org/airsonic/player/service/search/analysis/stopwords_artist.txt new file mode 100644 index 00000000..b7256418 --- /dev/null +++ b/airsonic-main/src/main/resources/org/airsonic/player/service/search/analysis/stopwords_artist.txt @@ -0,0 +1,45 @@ +# This file is part of Airsonic. +# +# Airsonic is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Airsonic is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Airsonic. If not, see <http://www.gnu.org/licenses/>. +# Copyright 2016 (C) Airsonic Authors + +# This file defines stop words suitable for music search. +# See EnglishAnalyzer.ENGLISH_STOP_WORDS_SET for default stopwords. 
+# +# a, an, and, are, as, at, +# be, but, by, for, if, in, +# into, is, it, no, not, of, +# on, or, such, that, the, +# their, then, there, these, +# they, this, to, was, will, +# with + +# Ignore articles that are used by default in the index. +# See SettingsService.DEFAULT_IGNORED_ARTICLES + +a +an +the +el +las +le +les + +# Unique conjunctions often used in artist fields. + +by +cv +feat +vs +with diff --git a/airsonic-main/src/test/java/org/airsonic/player/service/search/AnalyzerFactoryTestCase.java b/airsonic-main/src/test/java/org/airsonic/player/service/search/AnalyzerFactoryTestCase.java index 4e56142d..2c58750e 100644 --- a/airsonic-main/src/test/java/org/airsonic/player/service/search/AnalyzerFactoryTestCase.java +++ b/airsonic-main/src/test/java/org/airsonic/player/service/search/AnalyzerFactoryTestCase.java @@ -14,6 +14,7 @@ import java.util.List; import static java.util.Arrays.asList; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; /** * Test case for Analyzer. @@ -187,33 +188,46 @@ public class AnalyzerFactoryTestCase { String queryStop = "and are as at be but by for if in into is it no not of on " // + "or such that their then there these they this to was will with"; + /* + * Unique conjunctions often used in artist fields. + */ + String stopwordsForArtist = "by cv feat vs with"; + Arrays.stream(IndexType.values()).flatMap(i -> Arrays.stream(i.getFields())).forEach(n -> { List articleTerms = toTermString(n, queryArticle); List indexArticleTerms = toTermString(n, queryArticle4Index); List stopedTerms = toTermString(n, queryStop); + List artistTerms = toTermString(n, stopwordsForArtist); switch (n) { - case FieldNames.FOLDER: - case FieldNames.MEDIA_TYPE: - case FieldNames.GENRE: - case FieldNames.ARTIST: case FieldNames.ALBUM: case FieldNames.TITLE: - // It is removed because it is included in ENGLISH_STOP_WORDS_SET. + // Deleted because it is a stopword. 
assertEquals("article : " + n, 0, articleTerms.size()); - // Not removed because it is not included in ENGLISH_STOP_WORDS_SET. - assertEquals("sonic server index article: " + n, 6, indexArticleTerms.size()); - // It is removed because it is included in ENGLISH_STOP_WORDS_SET. - assertEquals("non-article stop words : " + n, 0, stopedTerms.size()); + // "la los" is not deleted(#1235). + assertEquals("sonic server index article: " + n, 2, indexArticleTerms.size()); + // Not deleted because it is not a stopword. + assertEquals("non-article stop words : " + n, 30, stopedTerms.size()); + // Not deleted because it is not a stopword. + assertEquals("stop words for artsist : " + n, 5, artistTerms.size()); break; - // Legacy has common behavior for all fields. - default: + case FieldNames.ARTIST: + + // Deleted because it is a stopword. assertEquals("article : " + n, 0, articleTerms.size()); - assertEquals("sonic server index article: " + n, 6, indexArticleTerms.size()); - assertEquals("non-article stop words : " + n, 0, stopedTerms.size()); + // "la los" is not deleted(#1235). + assertEquals("sonic server index article: " + n, 2, indexArticleTerms.size()); + // Not deleted because it is not a stopword(Except by and with). + assertEquals("non-article stop words : " + n, 28, stopedTerms.size()); + // Deleted because it is a stopword. + assertEquals("stop words for artsist : " + n, 0, artistTerms.size()); + break; + + default: + fail(); // no analyze field is not applicable break; } }); @@ -239,14 +253,16 @@ public class AnalyzerFactoryTestCase { public void testStopwardAndFullWidth() { /* - * Stop word is removed. + * This and is not deleted because they are different from the default stopword. 
*/ String queryHalfWidth = "THIS IS FULL-WIDTH SENTENCES."; List terms = toTermString(queryHalfWidth); - assertEquals(3, terms.size()); - assertEquals("full", terms.get(0)); - assertEquals("width", terms.get(1)); - assertEquals("sentences", terms.get(2)); + assertEquals(5, terms.size()); + assertEquals("this", terms.get(0)); + assertEquals("is", terms.get(1)); + assertEquals("full", terms.get(2)); + assertEquals("width", terms.get(3)); + assertEquals("sentences", terms.get(4)); /* * Legacy can avoid Stopward if it is full width. @@ -264,10 +280,12 @@ public class AnalyzerFactoryTestCase { * The filter order has been changed properly * as it is probably not a deliberate specification. */ - assertEquals(3, terms.size()); - assertEquals("full", terms.get(0)); - assertEquals("width", terms.get(1)); - assertEquals("sentences", terms.get(2)); + assertEquals(5, terms.size()); + assertEquals("this", terms.get(0)); + assertEquals("is", terms.get(1)); + assertEquals("full", terms.get(2)); + assertEquals("width", terms.get(3)); + assertEquals("sentences", terms.get(4)); } @@ -537,9 +555,11 @@ public class AnalyzerFactoryTestCase { */ String query = "This is Airsonic's analysis."; List terms = toTermString(query); - assertEquals(2, terms.size()); - assertEquals("airsonic", terms.get(0)); - assertEquals("analysis", terms.get(1)); + assertEquals(4, terms.size()); + assertEquals("this", terms.get(0));// Not deleted because it is not a stopword + assertEquals("is", terms.get(1));// Not deleted because it is not a stopword + assertEquals("airsonic", terms.get(2)); + assertEquals("analysis", terms.get(3)); /* * XXX 3.x -> 8.x : @@ -587,13 +607,18 @@ public class AnalyzerFactoryTestCase { */ String query = "This is formed with a form of the verb \"have\" and a past participl."; List terms = toTermString(query); - assertEquals(6, terms.size()); - assertEquals("formed", terms.get(0));// leave passive / not "form" - assertEquals("form", terms.get(1)); - assertEquals("verb", 
terms.get(2)); - assertEquals("have", terms.get(3)); - assertEquals("past", terms.get(4)); - assertEquals("participl", terms.get(5)); + assertEquals(11, terms.size()); + assertEquals("this", terms.get(0));// Not deleted because it is not a stopword + assertEquals("is", terms.get(1));// Not deleted because it is not a stopword + assertEquals("formed", terms.get(2));// leave passive / not "form" + assertEquals("with", terms.get(3));// Not deleted because it is not a stopword + assertEquals("form", terms.get(4)); + assertEquals("of", terms.get(5)); + assertEquals("verb", terms.get(6)); + assertEquals("have", terms.get(7)); + assertEquals("and", terms.get(8));// Not deleted because it is not a stopword + assertEquals("past", terms.get(9)); + assertEquals("participl", terms.get(10)); } diff --git a/airsonic-main/src/test/java/org/airsonic/player/service/search/SearchServiceStartWithStopwardsTestCase.java b/airsonic-main/src/test/java/org/airsonic/player/service/search/SearchServiceStartWithStopwardsTestCase.java index aea0639d..0a060b22 100644 --- a/airsonic-main/src/test/java/org/airsonic/player/service/search/SearchServiceStartWithStopwardsTestCase.java +++ b/airsonic-main/src/test/java/org/airsonic/player/service/search/SearchServiceStartWithStopwardsTestCase.java @@ -57,8 +57,8 @@ public class SearchServiceStartWithStopwardsTestCase extends AbstractAirsonicHom criteria.setQuery("will"); SearchResult result = searchService.search(criteria, folders, IndexType.ARTIST_ID3); - // XXX 3.x -> 8.x : The filter is properly applied to the input(Stopward) - Assert.assertEquals("Williams hit by \"will\" ", 0, result.getTotalHits()); + // Will hit because Airsonic's stopword is defined(#1235) + Assert.assertEquals("Williams hit by \"will\" ", 1, result.getTotalHits()); criteria.setQuery("the"); result = searchService.search(criteria, folders, IndexType.SONG);