From dba8610ff7f15331b922f4a0077a42e8b9c543b5 Mon Sep 17 00:00:00 2001 From: tesshucom Date: Sun, 15 Sep 2019 22:13:15 +0900 Subject: [PATCH] Apply stopwords dedicated to music search - Iterate index version. --- .../service/search/AnalyzerFactory.java | 30 ++++++- .../player/service/search/IndexManager.java | 2 +- .../service/search/analysis/stopwords.txt | 37 ++++++++ .../search/analysis/stopwords_artist.txt | 45 ++++++++++ .../search/AnalyzerFactoryTestCase.java | 89 ++++++++++++------- ...archServiceStartWithStopwardsTestCase.java | 4 +- 6 files changed, 170 insertions(+), 37 deletions(-) create mode 100644 airsonic-main/src/main/resources/org/airsonic/player/service/search/analysis/stopwords.txt create mode 100644 airsonic-main/src/main/resources/org/airsonic/player/service/search/analysis/stopwords_artist.txt diff --git a/airsonic-main/src/main/java/org/airsonic/player/service/search/AnalyzerFactory.java b/airsonic-main/src/main/java/org/airsonic/player/service/search/AnalyzerFactory.java index eeae8b9a..afee1aa1 100644 --- a/airsonic-main/src/main/java/org/airsonic/player/service/search/AnalyzerFactory.java +++ b/airsonic-main/src/main/java/org/airsonic/player/service/search/AnalyzerFactory.java @@ -51,6 +51,10 @@ import static org.springframework.util.ObjectUtils.isEmpty; @Component public final class AnalyzerFactory { + private static final String STOP_WORDS = "org/airsonic/player/service/search/analysis/stopwords.txt"; + + private static final String STOP_WORDS_ARTIST = "org/airsonic/player/service/search/analysis/stopwords_artist.txt"; + private Analyzer analyzer; private Analyzer queryAnalyzer; @@ -105,7 +109,19 @@ public final class AnalyzerFactory { .addTokenFilter(CJKWidthFilterFactory.class) .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "false") .addTokenFilter(LowerCaseFilterFactory.class) - .addTokenFilter(StopFilterFactory.class) + .addTokenFilter(StopFilterFactory.class, "words", STOP_WORDS) + 
.addTokenFilter(EnglishPossessiveFilterFactory.class); + addTokenFilterForUnderscoreRemovalAroundToken(builder); + return builder; + } + + private Builder createArtistAnalyzerBuilder() throws IOException { + Builder builder = CustomAnalyzer.builder() + .withTokenizer(StandardTokenizerFactory.class) + .addTokenFilter(CJKWidthFilterFactory.class) + .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "false") + .addTokenFilter(LowerCaseFilterFactory.class) + .addTokenFilter(StopFilterFactory.class, "words", STOP_WORDS_ARTIST) .addTokenFilter(EnglishPossessiveFilterFactory.class); addTokenFilterForUnderscoreRemovalAroundToken(builder); return builder; @@ -134,7 +150,15 @@ public final class AnalyzerFactory { public Analyzer getAnalyzer() throws IOException { if (isEmpty(analyzer)) { try { - analyzer = createDefaultAnalyzerBuilder().build(); + + Analyzer defaultAnalyzer = createDefaultAnalyzerBuilder().build(); + Analyzer artistAnalyzer = createArtistAnalyzerBuilder().build(); + + Map fieldAnalyzers = new HashMap<>(); + fieldAnalyzers.put(FieldNames.ARTIST, artistAnalyzer); + + analyzer = new PerFieldAnalyzerWrapper(defaultAnalyzer, fieldAnalyzers); + } catch (IOException e) { throw new IOException("Error when initializing Analyzer.", e); } @@ -161,9 +185,11 @@ public final class AnalyzerFactory { try { Analyzer defaultAnalyzer = createDefaultAnalyzerBuilder().build(); + Analyzer artistAnalyzer = createArtistAnalyzerBuilder().build(); Analyzer genreAnalyzer = createGenreAnalyzerBuilder().build(); Map fieldAnalyzers = new HashMap<>(); + fieldAnalyzers.put(FieldNames.ARTIST, artistAnalyzer); fieldAnalyzers.put(FieldNames.GENRE, genreAnalyzer); queryAnalyzer = new PerFieldAnalyzerWrapper(defaultAnalyzer, fieldAnalyzers); diff --git a/airsonic-main/src/main/java/org/airsonic/player/service/search/IndexManager.java b/airsonic-main/src/main/java/org/airsonic/player/service/search/IndexManager.java index dd859287..c2562fc2 100644 --- 
a/airsonic-main/src/main/java/org/airsonic/player/service/search/IndexManager.java +++ b/airsonic-main/src/main/java/org/airsonic/player/service/search/IndexManager.java @@ -74,7 +74,7 @@ public class IndexManager { * DocumentFactory or the class that they use. * */ - private static final int INDEX_VERSION = 16; + private static final int INDEX_VERSION = 17; /** * Literal name of index top directory. diff --git a/airsonic-main/src/main/resources/org/airsonic/player/service/search/analysis/stopwords.txt b/airsonic-main/src/main/resources/org/airsonic/player/service/search/analysis/stopwords.txt new file mode 100644 index 00000000..c080f0a0 --- /dev/null +++ b/airsonic-main/src/main/resources/org/airsonic/player/service/search/analysis/stopwords.txt @@ -0,0 +1,37 @@ +# This file is part of Airsonic. +# +# Airsonic is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Airsonic is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Airsonic. If not, see <http://www.gnu.org/licenses/>. +# Copyright 2016 (C) Airsonic Authors + +# This file defines stop words suitable for music search. +# See EnglishAnalyzer.ENGLISH_STOP_WORDS_SET for default stopwords. +# +# a, an, and, are, as, at, +# be, but, by, for, if, in, +# into, is, it, no, not, of, +# on, or, such, that, the, +# their, then, there, these, +# they, this, to, was, will, +# with + +# Ignore articles that are used by default in the index. 
+# See SettingsService.DEFAULT_IGNORED_ARTICLES + +a +an +the +el +las +le +les \ No newline at end of file diff --git a/airsonic-main/src/main/resources/org/airsonic/player/service/search/analysis/stopwords_artist.txt b/airsonic-main/src/main/resources/org/airsonic/player/service/search/analysis/stopwords_artist.txt new file mode 100644 index 00000000..b7256418 --- /dev/null +++ b/airsonic-main/src/main/resources/org/airsonic/player/service/search/analysis/stopwords_artist.txt @@ -0,0 +1,45 @@ +# This file is part of Airsonic. +# +# Airsonic is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Airsonic is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Airsonic. If not, see <http://www.gnu.org/licenses/>. +# Copyright 2016 (C) Airsonic Authors + +# This file defines stop words suitable for music search. +# See EnglishAnalyzer.ENGLISH_STOP_WORDS_SET for default stopwords. +# +# a, an, and, are, as, at, +# be, but, by, for, if, in, +# into, is, it, no, not, of, +# on, or, such, that, the, +# their, then, there, these, +# they, this, to, was, will, +# with + +# Ignore articles that are used by default in the index. +# See SettingsService.DEFAULT_IGNORED_ARTICLES + +a +an +the +el +las +le +les + +# Unique conjunctions often used in artist fields. 
+ +by +cv +feat +vs +with diff --git a/airsonic-main/src/test/java/org/airsonic/player/service/search/AnalyzerFactoryTestCase.java b/airsonic-main/src/test/java/org/airsonic/player/service/search/AnalyzerFactoryTestCase.java index 4e56142d..2c58750e 100644 --- a/airsonic-main/src/test/java/org/airsonic/player/service/search/AnalyzerFactoryTestCase.java +++ b/airsonic-main/src/test/java/org/airsonic/player/service/search/AnalyzerFactoryTestCase.java @@ -14,6 +14,7 @@ import java.util.List; import static java.util.Arrays.asList; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; /** * Test case for Analyzer. @@ -187,33 +188,46 @@ public class AnalyzerFactoryTestCase { String queryStop = "and are as at be but by for if in into is it no not of on " // + "or such that their then there these they this to was will with"; + /* + * Unique conjunctions often used in artist fields. + */ + String stopwordsForArtist = "by cv feat vs with"; + Arrays.stream(IndexType.values()).flatMap(i -> Arrays.stream(i.getFields())).forEach(n -> { List articleTerms = toTermString(n, queryArticle); List indexArticleTerms = toTermString(n, queryArticle4Index); List stopedTerms = toTermString(n, queryStop); + List artistTerms = toTermString(n, stopwordsForArtist); switch (n) { - case FieldNames.FOLDER: - case FieldNames.MEDIA_TYPE: - case FieldNames.GENRE: - case FieldNames.ARTIST: case FieldNames.ALBUM: case FieldNames.TITLE: - // It is removed because it is included in ENGLISH_STOP_WORDS_SET. + // Deleted because it is a stopword. assertEquals("article : " + n, 0, articleTerms.size()); - // Not removed because it is not included in ENGLISH_STOP_WORDS_SET. - assertEquals("sonic server index article: " + n, 6, indexArticleTerms.size()); - // It is removed because it is included in ENGLISH_STOP_WORDS_SET. - assertEquals("non-article stop words : " + n, 0, stopedTerms.size()); + // "la los" is not deleted(#1235). 
+ assertEquals("sonic server index article: " + n, 2, indexArticleTerms.size()); + // Not deleted because it is not a stopword. + assertEquals("non-article stop words : " + n, 30, stopedTerms.size()); + // Not deleted because it is not a stopword. + assertEquals("stop words for artsist : " + n, 5, artistTerms.size()); break; - // Legacy has common behavior for all fields. - default: + case FieldNames.ARTIST: + + // Deleted because it is a stopword. assertEquals("article : " + n, 0, articleTerms.size()); - assertEquals("sonic server index article: " + n, 6, indexArticleTerms.size()); - assertEquals("non-article stop words : " + n, 0, stopedTerms.size()); + // "la los" is not deleted(#1235). + assertEquals("sonic server index article: " + n, 2, indexArticleTerms.size()); + // Not deleted because it is not a stopword(Except by and with). + assertEquals("non-article stop words : " + n, 28, stopedTerms.size()); + // Deleted because it is a stopword. + assertEquals("stop words for artsist : " + n, 0, artistTerms.size()); + break; + + default: + fail(); // no analyze field is not applicable break; } }); @@ -239,14 +253,16 @@ public class AnalyzerFactoryTestCase { public void testStopwardAndFullWidth() { /* - * Stop word is removed. + * This and is not deleted because they are different from the default stopword. */ String queryHalfWidth = "THIS IS FULL-WIDTH SENTENCES."; List terms = toTermString(queryHalfWidth); - assertEquals(3, terms.size()); - assertEquals("full", terms.get(0)); - assertEquals("width", terms.get(1)); - assertEquals("sentences", terms.get(2)); + assertEquals(5, terms.size()); + assertEquals("this", terms.get(0)); + assertEquals("is", terms.get(1)); + assertEquals("full", terms.get(2)); + assertEquals("width", terms.get(3)); + assertEquals("sentences", terms.get(4)); /* * Legacy can avoid Stopward if it is full width. 
@@ -264,10 +280,12 @@ public class AnalyzerFactoryTestCase { * The filter order has been changed properly * as it is probably not a deliberate specification. */ - assertEquals(3, terms.size()); - assertEquals("full", terms.get(0)); - assertEquals("width", terms.get(1)); - assertEquals("sentences", terms.get(2)); + assertEquals(5, terms.size()); + assertEquals("this", terms.get(0)); + assertEquals("is", terms.get(1)); + assertEquals("full", terms.get(2)); + assertEquals("width", terms.get(3)); + assertEquals("sentences", terms.get(4)); } @@ -537,9 +555,11 @@ public class AnalyzerFactoryTestCase { */ String query = "This is Airsonic's analysis."; List terms = toTermString(query); - assertEquals(2, terms.size()); - assertEquals("airsonic", terms.get(0)); - assertEquals("analysis", terms.get(1)); + assertEquals(4, terms.size()); + assertEquals("this", terms.get(0));// Not deleted because it is not a stopword + assertEquals("is", terms.get(1));// Not deleted because it is not a stopword + assertEquals("airsonic", terms.get(2)); + assertEquals("analysis", terms.get(3)); /* * XXX 3.x -> 8.x : @@ -587,13 +607,18 @@ public class AnalyzerFactoryTestCase { */ String query = "This is formed with a form of the verb \"have\" and a past participl."; List terms = toTermString(query); - assertEquals(6, terms.size()); - assertEquals("formed", terms.get(0));// leave passive / not "form" - assertEquals("form", terms.get(1)); - assertEquals("verb", terms.get(2)); - assertEquals("have", terms.get(3)); - assertEquals("past", terms.get(4)); - assertEquals("participl", terms.get(5)); + assertEquals(11, terms.size()); + assertEquals("this", terms.get(0));// Not deleted because it is not a stopword + assertEquals("is", terms.get(1));// Not deleted because it is not a stopword + assertEquals("formed", terms.get(2));// leave passive / not "form" + assertEquals("with", terms.get(3));// Not deleted because it is not a stopword + assertEquals("form", terms.get(4)); + assertEquals("of", 
terms.get(5)); + assertEquals("verb", terms.get(6)); + assertEquals("have", terms.get(7)); + assertEquals("and", terms.get(8));// Not deleted because it is not a stopword + assertEquals("past", terms.get(9)); + assertEquals("participl", terms.get(10)); } diff --git a/airsonic-main/src/test/java/org/airsonic/player/service/search/SearchServiceStartWithStopwardsTestCase.java b/airsonic-main/src/test/java/org/airsonic/player/service/search/SearchServiceStartWithStopwardsTestCase.java index aea0639d..0a060b22 100644 --- a/airsonic-main/src/test/java/org/airsonic/player/service/search/SearchServiceStartWithStopwardsTestCase.java +++ b/airsonic-main/src/test/java/org/airsonic/player/service/search/SearchServiceStartWithStopwardsTestCase.java @@ -57,8 +57,8 @@ public class SearchServiceStartWithStopwardsTestCase extends AbstractAirsonicHom criteria.setQuery("will"); SearchResult result = searchService.search(criteria, folders, IndexType.ARTIST_ID3); - // XXX 3.x -> 8.x : The filter is properly applied to the input(Stopward) - Assert.assertEquals("Williams hit by \"will\" ", 0, result.getTotalHits()); + // Will hit because Airsonic's stopword is defined(#1235) + Assert.assertEquals("Williams hit by \"will\" ", 1, result.getTotalHits()); criteria.setQuery("the"); result = searchService.search(criteria, folders, IndexType.SONG);