Merge pull request #1235 from tesshucom/improve-search-accuracy

Improve search accuracy
master
François-Xavier Thomas 5 years ago committed by GitHub
commit 2d30a37208
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 30
      airsonic-main/src/main/java/org/airsonic/player/service/search/AnalyzerFactory.java
  2. 2
      airsonic-main/src/main/java/org/airsonic/player/service/search/IndexManager.java
  3. 12
      airsonic-main/src/main/java/org/airsonic/player/service/search/IndexType.java
  4. 10
      airsonic-main/src/main/java/org/airsonic/player/service/search/QueryFactory.java
  5. 37
      airsonic-main/src/main/resources/org/airsonic/player/service/search/analysis/stopwords.txt
  6. 45
      airsonic-main/src/main/resources/org/airsonic/player/service/search/analysis/stopwords_artist.txt
  7. 89
      airsonic-main/src/test/java/org/airsonic/player/service/search/AnalyzerFactoryTestCase.java
  8. 12
      airsonic-main/src/test/java/org/airsonic/player/service/search/QueryFactoryTestCase.java
  9. 4
      airsonic-main/src/test/java/org/airsonic/player/service/search/SearchServiceStartWithStopwardsTestCase.java

@ -51,6 +51,10 @@ import static org.springframework.util.ObjectUtils.isEmpty;
@Component @Component
public final class AnalyzerFactory { public final class AnalyzerFactory {
private static final String STOP_WORDS = "org/airsonic/player/service/search/analysis/stopwords.txt";
private static final String STOP_WORDS_ARTIST = "org/airsonic/player/service/search/analysis/stopwords_artist.txt";
private Analyzer analyzer; private Analyzer analyzer;
private Analyzer queryAnalyzer; private Analyzer queryAnalyzer;
@ -105,7 +109,19 @@ public final class AnalyzerFactory {
.addTokenFilter(CJKWidthFilterFactory.class) .addTokenFilter(CJKWidthFilterFactory.class)
.addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "false") .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "false")
.addTokenFilter(LowerCaseFilterFactory.class) .addTokenFilter(LowerCaseFilterFactory.class)
.addTokenFilter(StopFilterFactory.class) .addTokenFilter(StopFilterFactory.class, "words", STOP_WORDS)
.addTokenFilter(EnglishPossessiveFilterFactory.class);
addTokenFilterForUnderscoreRemovalAroundToken(builder);
return builder;
}
private Builder createArtistAnalyzerBuilder() throws IOException {
Builder builder = CustomAnalyzer.builder()
.withTokenizer(StandardTokenizerFactory.class)
.addTokenFilter(CJKWidthFilterFactory.class)
.addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "false")
.addTokenFilter(LowerCaseFilterFactory.class)
.addTokenFilter(StopFilterFactory.class, "words", STOP_WORDS_ARTIST)
.addTokenFilter(EnglishPossessiveFilterFactory.class); .addTokenFilter(EnglishPossessiveFilterFactory.class);
addTokenFilterForUnderscoreRemovalAroundToken(builder); addTokenFilterForUnderscoreRemovalAroundToken(builder);
return builder; return builder;
@ -134,7 +150,15 @@ public final class AnalyzerFactory {
public Analyzer getAnalyzer() throws IOException { public Analyzer getAnalyzer() throws IOException {
if (isEmpty(analyzer)) { if (isEmpty(analyzer)) {
try { try {
analyzer = createDefaultAnalyzerBuilder().build();
Analyzer defaultAnalyzer = createDefaultAnalyzerBuilder().build();
Analyzer artistAnalyzer = createArtistAnalyzerBuilder().build();
Map<String, Analyzer> fieldAnalyzers = new HashMap<>();
fieldAnalyzers.put(FieldNames.ARTIST, artistAnalyzer);
analyzer = new PerFieldAnalyzerWrapper(defaultAnalyzer, fieldAnalyzers);
} catch (IOException e) { } catch (IOException e) {
throw new IOException("Error when initializing Analyzer.", e); throw new IOException("Error when initializing Analyzer.", e);
} }
@ -161,9 +185,11 @@ public final class AnalyzerFactory {
try { try {
Analyzer defaultAnalyzer = createDefaultAnalyzerBuilder().build(); Analyzer defaultAnalyzer = createDefaultAnalyzerBuilder().build();
Analyzer artistAnalyzer = createArtistAnalyzerBuilder().build();
Analyzer genreAnalyzer = createGenreAnalyzerBuilder().build(); Analyzer genreAnalyzer = createGenreAnalyzerBuilder().build();
Map<String, Analyzer> fieldAnalyzers = new HashMap<>(); Map<String, Analyzer> fieldAnalyzers = new HashMap<>();
fieldAnalyzers.put(FieldNames.ARTIST, artistAnalyzer);
fieldAnalyzers.put(FieldNames.GENRE, genreAnalyzer); fieldAnalyzers.put(FieldNames.GENRE, genreAnalyzer);
queryAnalyzer = new PerFieldAnalyzerWrapper(defaultAnalyzer, fieldAnalyzers); queryAnalyzer = new PerFieldAnalyzerWrapper(defaultAnalyzer, fieldAnalyzers);

@ -74,7 +74,7 @@ public class IndexManager {
* DocumentFactory or the class that they use. * DocumentFactory or the class that they use.
* *
*/ */
private static final int INDEX_VERSION = 16; private static final int INDEX_VERSION = 17;
/** /**
* Literal name of index top directory. * Literal name of index top directory.

@ -39,7 +39,7 @@ public enum IndexType {
FieldNames.TITLE, FieldNames.TITLE,
FieldNames.ARTIST), FieldNames.ARTIST),
boosts( boosts(
entry(FieldNames.TITLE, 2F))), entry(FieldNames.TITLE, 1.1F))),
ALBUM( ALBUM(
fieldNames( fieldNames(
@ -47,7 +47,7 @@ public enum IndexType {
FieldNames.ARTIST), FieldNames.ARTIST),
// FieldNames.FOLDER), // XXX 3.x -> 8.x : Remove folder from multi-field search condition // FieldNames.FOLDER), // XXX 3.x -> 8.x : Remove folder from multi-field search condition
boosts( boosts(
entry(FieldNames.ALBUM, 2F))), entry(FieldNames.ALBUM, 1.1F))),
ALBUM_ID3( ALBUM_ID3(
fieldNames( fieldNames(
@ -55,20 +55,18 @@ public enum IndexType {
FieldNames.ARTIST), FieldNames.ARTIST),
// FieldNames.FOLDER_ID), // XXX 3.x -> 8.x : Remove folder from multi-field search condition // FieldNames.FOLDER_ID), // XXX 3.x -> 8.x : Remove folder from multi-field search condition
boosts( boosts(
entry(FieldNames.ALBUM, 2F))), entry(FieldNames.ALBUM, 1.1F))),
ARTIST( ARTIST(
fieldNames( fieldNames(
FieldNames.ARTIST), FieldNames.ARTIST),
// FieldNames.FOLDER), // XXX 3.x -> 8.x : Remove folder from multi-field search condition // FieldNames.FOLDER), // XXX 3.x -> 8.x : Remove folder from multi-field search condition
boosts( boosts()),
entry(FieldNames.ARTIST, 1F))),
ARTIST_ID3( ARTIST_ID3(
fieldNames( fieldNames(
FieldNames.ARTIST), FieldNames.ARTIST),
boosts( boosts()),
entry(FieldNames.ARTIST, 2F))),
; ;

@ -31,6 +31,7 @@ import org.apache.lucene.document.IntPoint;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.search.WildcardQuery;
@ -101,7 +102,8 @@ public class QueryFactory {
* - Self made parser process reduces one library dependency. * - Self made parser process reduces one library dependency.
* - It is easy to make corrections later when changing the query to improve search accuracy. * - It is easy to make corrections later when changing the query to improve search accuracy.
*/ */
private Query createMultiFieldWildQuery(@NonNull String[] fieldNames, @NonNull String queryString) private Query createMultiFieldWildQuery(@NonNull String[] fieldNames, @NonNull String queryString,
@NonNull IndexType indexType)
throws IOException { throws IOException {
BooleanQuery.Builder mainQuery = new BooleanQuery.Builder(); BooleanQuery.Builder mainQuery = new BooleanQuery.Builder();
@ -117,8 +119,12 @@ public class QueryFactory {
while (stream.incrementToken()) { while (stream.incrementToken()) {
String token = stream.getAttribute(CharTermAttribute.class).toString(); String token = stream.getAttribute(CharTermAttribute.class).toString();
WildcardQuery wildcardQuery = new WildcardQuery(new Term(fieldName, token.concat(ASTERISK))); WildcardQuery wildcardQuery = new WildcardQuery(new Term(fieldName, token.concat(ASTERISK)));
if (indexType.getBoosts().containsKey(fieldName)) {
fieldQuerys.add(new BoostQuery(wildcardQuery, indexType.getBoosts().get(fieldName)));
} else {
fieldQuerys.add(wildcardQuery); fieldQuerys.add(wildcardQuery);
} }
}
fieldsQuerys.add(fieldQuerys); fieldsQuerys.add(fieldQuerys);
} }
} }
@ -169,7 +175,7 @@ public class QueryFactory {
BooleanQuery.Builder mainQuery = new BooleanQuery.Builder(); BooleanQuery.Builder mainQuery = new BooleanQuery.Builder();
Query multiFieldQuery = createMultiFieldWildQuery(indexType.getFields(), criteria.getQuery()); Query multiFieldQuery = createMultiFieldWildQuery(indexType.getFields(), criteria.getQuery(), indexType);
mainQuery.add(multiFieldQuery, Occur.MUST); mainQuery.add(multiFieldQuery, Occur.MUST);
boolean isId3 = indexType == IndexType.ALBUM_ID3 || indexType == IndexType.ARTIST_ID3; boolean isId3 = indexType == IndexType.ALBUM_ID3 || indexType == IndexType.ARTIST_ID3;

@ -0,0 +1,37 @@
# This file is part of Airsonic.
#
# Airsonic is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Airsonic is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Airsonic. If not, see <http://www.gnu.org/licenses/>.
# Copyright 2016 (C) Airsonic Authors
# This file defines stop words suitable for music search.
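# Lucene's defaults (EnglishAnalyzer.ENGLISH_STOP_WORDS_SET) are listed below for reference only.
# They are commented out and therefore NOT applied; only the uncommented entries further down are active.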
#
# a, an, and, are, as, at,
# be, but, by, for, if, in,
# into, is, it, no, not, of,
# on, or, such, that, the,
# their, then, there, these,
# they, this, to, was, will,
# with
# Ignore articles that are used by default in the index.
# See SettingsService.DEFAULT_IGNORED_ARTICLES
a
an
the
el
las
le
les

@ -0,0 +1,45 @@
# This file is part of Airsonic.
#
# Airsonic is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Airsonic is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Airsonic. If not, see <http://www.gnu.org/licenses/>.
# Copyright 2016 (C) Airsonic Authors
# This file defines stop words suitable for music search.
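# Lucene's defaults (EnglishAnalyzer.ENGLISH_STOP_WORDS_SET) are listed below for reference only.
# They are commented out and therefore NOT applied; only the uncommented entries further down are active.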
#
# a, an, and, are, as, at,
# be, but, by, for, if, in,
# into, is, it, no, not, of,
# on, or, such, that, the,
# their, then, there, these,
# they, this, to, was, will,
# with
# Ignore articles that are used by default in the index.
# See SettingsService.DEFAULT_IGNORED_ARTICLES
a
an
the
el
las
le
les
# Unique conjunctions often used in artist fields.
by
cv
feat
vs
with

@ -14,6 +14,7 @@ import java.util.List;
import static java.util.Arrays.asList; import static java.util.Arrays.asList;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
/** /**
* Test case for Analyzer. * Test case for Analyzer.
@ -187,33 +188,46 @@ public class AnalyzerFactoryTestCase {
String queryStop = "and are as at be but by for if in into is it no not of on " // String queryStop = "and are as at be but by for if in into is it no not of on " //
+ "or such that their then there these they this to was will with"; + "or such that their then there these they this to was will with";
/*
* Unique conjunctions often used in artist fields.
*/
String stopwordsForArtist = "by cv feat vs with";
Arrays.stream(IndexType.values()).flatMap(i -> Arrays.stream(i.getFields())).forEach(n -> { Arrays.stream(IndexType.values()).flatMap(i -> Arrays.stream(i.getFields())).forEach(n -> {
List<String> articleTerms = toTermString(n, queryArticle); List<String> articleTerms = toTermString(n, queryArticle);
List<String> indexArticleTerms = toTermString(n, queryArticle4Index); List<String> indexArticleTerms = toTermString(n, queryArticle4Index);
List<String> stopedTerms = toTermString(n, queryStop); List<String> stopedTerms = toTermString(n, queryStop);
List<String> artistTerms = toTermString(n, stopwordsForArtist);
switch (n) { switch (n) {
case FieldNames.FOLDER:
case FieldNames.MEDIA_TYPE:
case FieldNames.GENRE:
case FieldNames.ARTIST:
case FieldNames.ALBUM: case FieldNames.ALBUM:
case FieldNames.TITLE: case FieldNames.TITLE:
// It is removed because it is included in ENGLISH_STOP_WORDS_SET. // Deleted because it is a stopword.
assertEquals("article : " + n, 0, articleTerms.size()); assertEquals("article : " + n, 0, articleTerms.size());
// Not removed because it is not included in ENGLISH_STOP_WORDS_SET. // "la los" is not deleted(#1235).
assertEquals("sonic server index article: " + n, 6, indexArticleTerms.size()); assertEquals("sonic server index article: " + n, 2, indexArticleTerms.size());
// It is removed because it is included in ENGLISH_STOP_WORDS_SET. // Not deleted because it is not a stopword.
assertEquals("non-article stop words : " + n, 0, stopedTerms.size()); assertEquals("non-article stop words : " + n, 30, stopedTerms.size());
// Not deleted because it is not a stopword.
assertEquals("stop words for artsist : " + n, 5, artistTerms.size());
break; break;
// Legacy has common behavior for all fields. case FieldNames.ARTIST:
default:
// Deleted because it is a stopword.
assertEquals("article : " + n, 0, articleTerms.size()); assertEquals("article : " + n, 0, articleTerms.size());
assertEquals("sonic server index article: " + n, 6, indexArticleTerms.size()); // "la los" is not deleted(#1235).
assertEquals("non-article stop words : " + n, 0, stopedTerms.size()); assertEquals("sonic server index article: " + n, 2, indexArticleTerms.size());
// Not deleted because it is not a stopword(Except by and with).
assertEquals("non-article stop words : " + n, 28, stopedTerms.size());
// Deleted because it is a stopword.
assertEquals("stop words for artsist : " + n, 0, artistTerms.size());
break;
default:
fail(); // no analyze field is not applicable
break; break;
} }
}); });
@ -239,14 +253,16 @@ public class AnalyzerFactoryTestCase {
public void testStopwardAndFullWidth() { public void testStopwardAndFullWidth() {
/* /*
* Stop word is removed. * "This" and "is" are not deleted because, unlike the Lucene defaults, they are not in Airsonic's stopword list.
*/ */
String queryHalfWidth = "THIS IS FULL-WIDTH SENTENCES."; String queryHalfWidth = "THIS IS FULL-WIDTH SENTENCES.";
List<String> terms = toTermString(queryHalfWidth); List<String> terms = toTermString(queryHalfWidth);
assertEquals(3, terms.size()); assertEquals(5, terms.size());
assertEquals("full", terms.get(0)); assertEquals("this", terms.get(0));
assertEquals("width", terms.get(1)); assertEquals("is", terms.get(1));
assertEquals("sentences", terms.get(2)); assertEquals("full", terms.get(2));
assertEquals("width", terms.get(3));
assertEquals("sentences", terms.get(4));
/* /*
* Legacy can avoid Stopward if it is full width. * Legacy can avoid Stopward if it is full width.
@ -264,10 +280,12 @@ public class AnalyzerFactoryTestCase {
* The filter order has been changed properly * The filter order has been changed properly
* as it is probably not a deliberate specification. * as it is probably not a deliberate specification.
*/ */
assertEquals(3, terms.size()); assertEquals(5, terms.size());
assertEquals("full", terms.get(0)); assertEquals("this", terms.get(0));
assertEquals("width", terms.get(1)); assertEquals("is", terms.get(1));
assertEquals("sentences", terms.get(2)); assertEquals("full", terms.get(2));
assertEquals("width", terms.get(3));
assertEquals("sentences", terms.get(4));
} }
@ -537,9 +555,11 @@ public class AnalyzerFactoryTestCase {
*/ */
String query = "This is Airsonic's analysis."; String query = "This is Airsonic's analysis.";
List<String> terms = toTermString(query); List<String> terms = toTermString(query);
assertEquals(2, terms.size()); assertEquals(4, terms.size());
assertEquals("airsonic", terms.get(0)); assertEquals("this", terms.get(0));// Not deleted because it is not a stopword
assertEquals("analysis", terms.get(1)); assertEquals("is", terms.get(1));// Not deleted because it is not a stopword
assertEquals("airsonic", terms.get(2));
assertEquals("analysis", terms.get(3));
/* /*
* XXX 3.x -> 8.x : * XXX 3.x -> 8.x :
@ -587,13 +607,18 @@ public class AnalyzerFactoryTestCase {
*/ */
String query = "This is formed with a form of the verb \"have\" and a past participl."; String query = "This is formed with a form of the verb \"have\" and a past participl.";
List<String> terms = toTermString(query); List<String> terms = toTermString(query);
assertEquals(6, terms.size()); assertEquals(11, terms.size());
assertEquals("formed", terms.get(0));// leave passive / not "form" assertEquals("this", terms.get(0));// Not deleted because it is not a stopword
assertEquals("form", terms.get(1)); assertEquals("is", terms.get(1));// Not deleted because it is not a stopword
assertEquals("verb", terms.get(2)); assertEquals("formed", terms.get(2));// leave passive / not "form"
assertEquals("have", terms.get(3)); assertEquals("with", terms.get(3));// Not deleted because it is not a stopword
assertEquals("past", terms.get(4)); assertEquals("form", terms.get(4));
assertEquals("participl", terms.get(5)); assertEquals("of", terms.get(5));
assertEquals("verb", terms.get(6));
assertEquals("have", terms.get(7));
assertEquals("and", terms.get(8));// Not deleted because it is not a stopword
assertEquals("past", terms.get(9));
assertEquals("participl", terms.get(10));
} }

@ -123,13 +123,13 @@ public class QueryFactoryTestCase {
Query query = queryFactory.search(criteria, SINGLE_FOLDERS, IndexType.ALBUM); Query query = queryFactory.search(criteria, SINGLE_FOLDERS, IndexType.ALBUM);
assertEquals("SearchAlbum", assertEquals("SearchAlbum",
"+((album:abc* artist:abc*) (album:def* artist:def*)) +(folder:" + PATH1 "+(((album:abc*)^1.1 artist:abc*) ((album:def*)^1.1 artist:def*)) +(folder:" + PATH1
+ ")", + ")",
query.toString()); query.toString());
query = queryFactory.search(criteria, MULTI_FOLDERS, IndexType.ALBUM); query = queryFactory.search(criteria, MULTI_FOLDERS, IndexType.ALBUM);
assertEquals("SearchAlbum", assertEquals("SearchAlbum",
"+((album:abc* artist:abc*) (album:def* artist:def*)) +(folder:" + PATH1 "+(((album:abc*)^1.1 artist:abc*) ((album:def*)^1.1 artist:def*)) +(folder:" + PATH1
+ " folder:" + PATH2 + ")", + " folder:" + PATH2 + ")",
query.toString()); query.toString());
} }
@ -143,11 +143,11 @@ public class QueryFactoryTestCase {
Query query = queryFactory.search(criteria, SINGLE_FOLDERS, IndexType.SONG); Query query = queryFactory.search(criteria, SINGLE_FOLDERS, IndexType.SONG);
assertEquals("SearchSong", assertEquals("SearchSong",
"+((title:abc* artist:abc*) (title:def* artist:def*)) +(folder:" + PATH1 + ")", "+(((title:abc*)^1.1 artist:abc*) ((title:def*)^1.1 artist:def*)) +(folder:" + PATH1 + ")",
query.toString()); query.toString());
query = queryFactory.search(criteria, MULTI_FOLDERS, IndexType.SONG); query = queryFactory.search(criteria, MULTI_FOLDERS, IndexType.SONG);
assertEquals("SearchSong", "+((title:abc* artist:abc*) (title:def* artist:def*)) +(folder:" + PATH1 assertEquals("SearchSong", "+(((title:abc*)^1.1 artist:abc*) ((title:def*)^1.1 artist:def*)) +(folder:" + PATH1
+ " folder:" + PATH2 + ")", query.toString()); + " folder:" + PATH2 + ")", query.toString());
} }
@ -178,13 +178,13 @@ public class QueryFactoryTestCase {
Query query = queryFactory.search(criteria, SINGLE_FOLDERS, IndexType.ALBUM_ID3); Query query = queryFactory.search(criteria, SINGLE_FOLDERS, IndexType.ALBUM_ID3);
assertEquals( assertEquals(
"SearchAlbumId3", "+((album:abc* artist:abc*) (album:def* artist:def*)) " "SearchAlbumId3", "+(((album:abc*)^1.1 artist:abc*) ((album:def*)^1.1 artist:def*)) "
+ "+(folderId:" + FID1 + ")", + "+(folderId:" + FID1 + ")",
query.toString()); query.toString());
query = queryFactory.search(criteria, MULTI_FOLDERS, IndexType.ALBUM_ID3); query = queryFactory.search(criteria, MULTI_FOLDERS, IndexType.ALBUM_ID3);
assertEquals("SearchAlbumId3", assertEquals("SearchAlbumId3",
"+((album:abc* artist:abc*) (album:def* artist:def*)) +(folderId:" "+(((album:abc*)^1.1 artist:abc*) ((album:def*)^1.1 artist:def*)) +(folderId:"
+ FID1 + " folderId:" + FID1 + " folderId:"
+ FID2 + ")", + FID2 + ")",
query.toString()); query.toString());

@ -57,8 +57,8 @@ public class SearchServiceStartWithStopwardsTestCase extends AbstractAirsonicHom
criteria.setQuery("will"); criteria.setQuery("will");
SearchResult result = searchService.search(criteria, folders, IndexType.ARTIST_ID3); SearchResult result = searchService.search(criteria, folders, IndexType.ARTIST_ID3);
// XXX 3.x -> 8.x : The filter is properly applied to the input(Stopward) // Will hit because Airsonic's stopword is defined(#1235)
Assert.assertEquals("Williams hit by \"will\" ", 0, result.getTotalHits()); Assert.assertEquals("Williams hit by \"will\" ", 1, result.getTotalHits());
criteria.setQuery("the"); criteria.setQuery("the");
result = searchService.search(criteria, folders, IndexType.SONG); result = searchService.search(criteria, folders, IndexType.SONG);

Loading…
Cancel
Save