Merge pull request #1235 from tesshucom/improve-search-accuracy

Improve search accuracy
master
François-Xavier Thomas 5 years ago committed by GitHub
commit 2d30a37208
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 30
      airsonic-main/src/main/java/org/airsonic/player/service/search/AnalyzerFactory.java
  2. 2
      airsonic-main/src/main/java/org/airsonic/player/service/search/IndexManager.java
  3. 12
      airsonic-main/src/main/java/org/airsonic/player/service/search/IndexType.java
  4. 12
      airsonic-main/src/main/java/org/airsonic/player/service/search/QueryFactory.java
  5. 37
      airsonic-main/src/main/resources/org/airsonic/player/service/search/analysis/stopwords.txt
  6. 45
      airsonic-main/src/main/resources/org/airsonic/player/service/search/analysis/stopwords_artist.txt
  7. 89
      airsonic-main/src/test/java/org/airsonic/player/service/search/AnalyzerFactoryTestCase.java
  8. 12
      airsonic-main/src/test/java/org/airsonic/player/service/search/QueryFactoryTestCase.java
  9. 4
      airsonic-main/src/test/java/org/airsonic/player/service/search/SearchServiceStartWithStopwardsTestCase.java

@ -51,6 +51,10 @@ import static org.springframework.util.ObjectUtils.isEmpty;
@Component
public final class AnalyzerFactory {
private static final String STOP_WORDS = "org/airsonic/player/service/search/analysis/stopwords.txt";
private static final String STOP_WORDS_ARTIST = "org/airsonic/player/service/search/analysis/stopwords_artist.txt";
private Analyzer analyzer;
private Analyzer queryAnalyzer;
@ -105,7 +109,19 @@ public final class AnalyzerFactory {
.addTokenFilter(CJKWidthFilterFactory.class)
.addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "false")
.addTokenFilter(LowerCaseFilterFactory.class)
.addTokenFilter(StopFilterFactory.class)
.addTokenFilter(StopFilterFactory.class, "words", STOP_WORDS)
.addTokenFilter(EnglishPossessiveFilterFactory.class);
addTokenFilterForUnderscoreRemovalAroundToken(builder);
return builder;
}
private Builder createArtistAnalyzerBuilder() throws IOException {
Builder builder = CustomAnalyzer.builder()
.withTokenizer(StandardTokenizerFactory.class)
.addTokenFilter(CJKWidthFilterFactory.class)
.addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "false")
.addTokenFilter(LowerCaseFilterFactory.class)
.addTokenFilter(StopFilterFactory.class, "words", STOP_WORDS_ARTIST)
.addTokenFilter(EnglishPossessiveFilterFactory.class);
addTokenFilterForUnderscoreRemovalAroundToken(builder);
return builder;
@ -134,7 +150,15 @@ public final class AnalyzerFactory {
public Analyzer getAnalyzer() throws IOException {
if (isEmpty(analyzer)) {
try {
analyzer = createDefaultAnalyzerBuilder().build();
Analyzer defaultAnalyzer = createDefaultAnalyzerBuilder().build();
Analyzer artistAnalyzer = createArtistAnalyzerBuilder().build();
Map<String, Analyzer> fieldAnalyzers = new HashMap<>();
fieldAnalyzers.put(FieldNames.ARTIST, artistAnalyzer);
analyzer = new PerFieldAnalyzerWrapper(defaultAnalyzer, fieldAnalyzers);
} catch (IOException e) {
throw new IOException("Error when initializing Analyzer.", e);
}
@ -161,9 +185,11 @@ public final class AnalyzerFactory {
try {
Analyzer defaultAnalyzer = createDefaultAnalyzerBuilder().build();
Analyzer artistAnalyzer = createArtistAnalyzerBuilder().build();
Analyzer genreAnalyzer = createGenreAnalyzerBuilder().build();
Map<String, Analyzer> fieldAnalyzers = new HashMap<>();
fieldAnalyzers.put(FieldNames.ARTIST, artistAnalyzer);
fieldAnalyzers.put(FieldNames.GENRE, genreAnalyzer);
queryAnalyzer = new PerFieldAnalyzerWrapper(defaultAnalyzer, fieldAnalyzers);

@ -74,7 +74,7 @@ public class IndexManager {
* DocumentFactory or the class that they use.
*
*/
private static final int INDEX_VERSION = 16;
private static final int INDEX_VERSION = 17;
/**
* Literal name of index top directory.

@ -39,7 +39,7 @@ public enum IndexType {
FieldNames.TITLE,
FieldNames.ARTIST),
boosts(
entry(FieldNames.TITLE, 2F))),
entry(FieldNames.TITLE, 1.1F))),
ALBUM(
fieldNames(
@ -47,7 +47,7 @@ public enum IndexType {
FieldNames.ARTIST),
// FieldNames.FOLDER), // XXX 3.x -> 8.x : Remove folder from multi-field search condition
boosts(
entry(FieldNames.ALBUM, 2F))),
entry(FieldNames.ALBUM, 1.1F))),
ALBUM_ID3(
fieldNames(
@ -55,20 +55,18 @@ public enum IndexType {
FieldNames.ARTIST),
// FieldNames.FOLDER_ID), // XXX 3.x -> 8.x : Remove folder from multi-field search condition
boosts(
entry(FieldNames.ALBUM, 2F))),
entry(FieldNames.ALBUM, 1.1F))),
ARTIST(
fieldNames(
FieldNames.ARTIST),
// FieldNames.FOLDER), // XXX 3.x -> 8.x : Remove folder from multi-field search condition
boosts(
entry(FieldNames.ARTIST, 1F))),
boosts()),
ARTIST_ID3(
fieldNames(
FieldNames.ARTIST),
boosts(
entry(FieldNames.ARTIST, 2F))),
boosts()),
;

@ -31,6 +31,7 @@ import org.apache.lucene.document.IntPoint;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
@ -101,7 +102,8 @@ public class QueryFactory {
* - Self made parser process reduces one library dependency.
* - It is easy to make corrections later when changing the query to improve search accuracy.
*/
private Query createMultiFieldWildQuery(@NonNull String[] fieldNames, @NonNull String queryString)
private Query createMultiFieldWildQuery(@NonNull String[] fieldNames, @NonNull String queryString,
@NonNull IndexType indexType)
throws IOException {
BooleanQuery.Builder mainQuery = new BooleanQuery.Builder();
@ -117,7 +119,11 @@ public class QueryFactory {
while (stream.incrementToken()) {
String token = stream.getAttribute(CharTermAttribute.class).toString();
WildcardQuery wildcardQuery = new WildcardQuery(new Term(fieldName, token.concat(ASTERISK)));
fieldQuerys.add(wildcardQuery);
if (indexType.getBoosts().containsKey(fieldName)) {
fieldQuerys.add(new BoostQuery(wildcardQuery, indexType.getBoosts().get(fieldName)));
} else {
fieldQuerys.add(wildcardQuery);
}
}
fieldsQuerys.add(fieldQuerys);
}
@ -169,7 +175,7 @@ public class QueryFactory {
BooleanQuery.Builder mainQuery = new BooleanQuery.Builder();
Query multiFieldQuery = createMultiFieldWildQuery(indexType.getFields(), criteria.getQuery());
Query multiFieldQuery = createMultiFieldWildQuery(indexType.getFields(), criteria.getQuery(), indexType);
mainQuery.add(multiFieldQuery, Occur.MUST);
boolean isId3 = indexType == IndexType.ALBUM_ID3 || indexType == IndexType.ARTIST_ID3;

@ -0,0 +1,37 @@
# This file is part of Airsonic.
#
# Airsonic is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Airsonic is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Airsonic. If not, see <http://www.gnu.org/licenses/>.
# Copyright 2016 (C) Airsonic Authors
# This file defines stop words suitable for music search.
# See EnglishAnalyzer.ENGLISH_STOP_WORDS_SET for Lucene's default stopwords,
# reproduced in the comment lines below for reference (comment lines are not applied):
#
# a, an, and, are, as, at,
# be, but, by, for, if, in,
# into, is, it, no, not, of,
# on, or, such, that, the,
# their, then, there, these,
# they, this, to, was, will,
# with
# Ignore articles that are used by default in the index.
# See SettingsService.DEFAULT_IGNORED_ARTICLES
a
an
the
el
las
le
les

@ -0,0 +1,45 @@
# This file is part of Airsonic.
#
# Airsonic is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Airsonic is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Airsonic. If not, see <http://www.gnu.org/licenses/>.
# Copyright 2016 (C) Airsonic Authors
# This file defines stop words suitable for music search.
# See EnglishAnalyzer.ENGLISH_STOP_WORDS_SET for Lucene's default stopwords,
# reproduced in the comment lines below for reference (comment lines are not applied):
#
# a, an, and, are, as, at,
# be, but, by, for, if, in,
# into, is, it, no, not, of,
# on, or, such, that, the,
# their, then, there, these,
# they, this, to, was, will,
# with
# Ignore articles that are used by default in the index.
# See SettingsService.DEFAULT_IGNORED_ARTICLES
a
an
the
el
las
le
les
# Unique conjunctions often used in artist fields.
by
cv
feat
vs
with

@ -14,6 +14,7 @@ import java.util.List;
import static java.util.Arrays.asList;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
/**
* Test case for Analyzer.
@ -187,33 +188,46 @@ public class AnalyzerFactoryTestCase {
String queryStop = "and are as at be but by for if in into is it no not of on " //
+ "or such that their then there these they this to was will with";
/*
* Unique conjunctions often used in artist fields.
*/
String stopwordsForArtist = "by cv feat vs with";
Arrays.stream(IndexType.values()).flatMap(i -> Arrays.stream(i.getFields())).forEach(n -> {
List<String> articleTerms = toTermString(n, queryArticle);
List<String> indexArticleTerms = toTermString(n, queryArticle4Index);
List<String> stopedTerms = toTermString(n, queryStop);
List<String> artistTerms = toTermString(n, stopwordsForArtist);
switch (n) {
case FieldNames.FOLDER:
case FieldNames.MEDIA_TYPE:
case FieldNames.GENRE:
case FieldNames.ARTIST:
case FieldNames.ALBUM:
case FieldNames.TITLE:
// It is removed because it is included in ENGLISH_STOP_WORDS_SET.
// Deleted because it is a stopword.
assertEquals("article : " + n, 0, articleTerms.size());
// Not removed because it is not included in ENGLISH_STOP_WORDS_SET.
assertEquals("sonic server index article: " + n, 6, indexArticleTerms.size());
// It is removed because it is included in ENGLISH_STOP_WORDS_SET.
assertEquals("non-article stop words : " + n, 0, stopedTerms.size());
// "la" and "los" are not deleted (#1235).
assertEquals("sonic server index article: " + n, 2, indexArticleTerms.size());
// Not deleted because it is not a stopword.
assertEquals("non-article stop words : " + n, 30, stopedTerms.size());
// Not deleted because it is not a stopword.
assertEquals("stop words for artsist : " + n, 5, artistTerms.size());
break;
// Legacy has common behavior for all fields.
default:
case FieldNames.ARTIST:
// Deleted because it is a stopword.
assertEquals("article : " + n, 0, articleTerms.size());
assertEquals("sonic server index article: " + n, 6, indexArticleTerms.size());
assertEquals("non-article stop words : " + n, 0, stopedTerms.size());
// "la" and "los" are not deleted (#1235).
assertEquals("sonic server index article: " + n, 2, indexArticleTerms.size());
// Not deleted because they are not stopwords (except "by" and "with").
assertEquals("non-article stop words : " + n, 28, stopedTerms.size());
// Deleted because it is a stopword.
assertEquals("stop words for artsist : " + n, 0, artistTerms.size());
break;
default:
fail(); // no analyze field is not applicable
break;
}
});
@ -239,14 +253,16 @@ public class AnalyzerFactoryTestCase {
public void testStopwardAndFullWidth() {
/*
* Stop word is removed.
* "This" and "is" are not deleted because the stopword list used here differs from the default one.
*/
String queryHalfWidth = "THIS IS FULL-WIDTH SENTENCES.";
List<String> terms = toTermString(queryHalfWidth);
assertEquals(3, terms.size());
assertEquals("full", terms.get(0));
assertEquals("width", terms.get(1));
assertEquals("sentences", terms.get(2));
assertEquals(5, terms.size());
assertEquals("this", terms.get(0));
assertEquals("is", terms.get(1));
assertEquals("full", terms.get(2));
assertEquals("width", terms.get(3));
assertEquals("sentences", terms.get(4));
/*
* Legacy can avoid stopword removal if the text is full width.
@ -264,10 +280,12 @@ public class AnalyzerFactoryTestCase {
* The filter order has been changed properly
* as it is probably not a deliberate specification.
*/
assertEquals(3, terms.size());
assertEquals("full", terms.get(0));
assertEquals("width", terms.get(1));
assertEquals("sentences", terms.get(2));
assertEquals(5, terms.size());
assertEquals("this", terms.get(0));
assertEquals("is", terms.get(1));
assertEquals("full", terms.get(2));
assertEquals("width", terms.get(3));
assertEquals("sentences", terms.get(4));
}
@ -537,9 +555,11 @@ public class AnalyzerFactoryTestCase {
*/
String query = "This is Airsonic's analysis.";
List<String> terms = toTermString(query);
assertEquals(2, terms.size());
assertEquals("airsonic", terms.get(0));
assertEquals("analysis", terms.get(1));
assertEquals(4, terms.size());
assertEquals("this", terms.get(0));// Not deleted because it is not a stopword
assertEquals("is", terms.get(1));// Not deleted because it is not a stopword
assertEquals("airsonic", terms.get(2));
assertEquals("analysis", terms.get(3));
/*
* XXX 3.x -> 8.x :
@ -587,13 +607,18 @@ public class AnalyzerFactoryTestCase {
*/
String query = "This is formed with a form of the verb \"have\" and a past participl.";
List<String> terms = toTermString(query);
assertEquals(6, terms.size());
assertEquals("formed", terms.get(0));// leave passive / not "form"
assertEquals("form", terms.get(1));
assertEquals("verb", terms.get(2));
assertEquals("have", terms.get(3));
assertEquals("past", terms.get(4));
assertEquals("participl", terms.get(5));
assertEquals(11, terms.size());
assertEquals("this", terms.get(0));// Not deleted because it is not a stopword
assertEquals("is", terms.get(1));// Not deleted because it is not a stopword
assertEquals("formed", terms.get(2));// leave passive / not "form"
assertEquals("with", terms.get(3));// Not deleted because it is not a stopword
assertEquals("form", terms.get(4));
assertEquals("of", terms.get(5));
assertEquals("verb", terms.get(6));
assertEquals("have", terms.get(7));
assertEquals("and", terms.get(8));// Not deleted because it is not a stopword
assertEquals("past", terms.get(9));
assertEquals("participl", terms.get(10));
}

@ -123,13 +123,13 @@ public class QueryFactoryTestCase {
Query query = queryFactory.search(criteria, SINGLE_FOLDERS, IndexType.ALBUM);
assertEquals("SearchAlbum",
"+((album:abc* artist:abc*) (album:def* artist:def*)) +(folder:" + PATH1
"+(((album:abc*)^1.1 artist:abc*) ((album:def*)^1.1 artist:def*)) +(folder:" + PATH1
+ ")",
query.toString());
query = queryFactory.search(criteria, MULTI_FOLDERS, IndexType.ALBUM);
assertEquals("SearchAlbum",
"+((album:abc* artist:abc*) (album:def* artist:def*)) +(folder:" + PATH1
"+(((album:abc*)^1.1 artist:abc*) ((album:def*)^1.1 artist:def*)) +(folder:" + PATH1
+ " folder:" + PATH2 + ")",
query.toString());
}
@ -143,11 +143,11 @@ public class QueryFactoryTestCase {
Query query = queryFactory.search(criteria, SINGLE_FOLDERS, IndexType.SONG);
assertEquals("SearchSong",
"+((title:abc* artist:abc*) (title:def* artist:def*)) +(folder:" + PATH1 + ")",
"+(((title:abc*)^1.1 artist:abc*) ((title:def*)^1.1 artist:def*)) +(folder:" + PATH1 + ")",
query.toString());
query = queryFactory.search(criteria, MULTI_FOLDERS, IndexType.SONG);
assertEquals("SearchSong", "+((title:abc* artist:abc*) (title:def* artist:def*)) +(folder:" + PATH1
assertEquals("SearchSong", "+(((title:abc*)^1.1 artist:abc*) ((title:def*)^1.1 artist:def*)) +(folder:" + PATH1
+ " folder:" + PATH2 + ")", query.toString());
}
@ -178,13 +178,13 @@ public class QueryFactoryTestCase {
Query query = queryFactory.search(criteria, SINGLE_FOLDERS, IndexType.ALBUM_ID3);
assertEquals(
"SearchAlbumId3", "+((album:abc* artist:abc*) (album:def* artist:def*)) "
"SearchAlbumId3", "+(((album:abc*)^1.1 artist:abc*) ((album:def*)^1.1 artist:def*)) "
+ "+(folderId:" + FID1 + ")",
query.toString());
query = queryFactory.search(criteria, MULTI_FOLDERS, IndexType.ALBUM_ID3);
assertEquals("SearchAlbumId3",
"+((album:abc* artist:abc*) (album:def* artist:def*)) +(folderId:"
"+(((album:abc*)^1.1 artist:abc*) ((album:def*)^1.1 artist:def*)) +(folderId:"
+ FID1 + " folderId:"
+ FID2 + ")",
query.toString());

@ -57,8 +57,8 @@ public class SearchServiceStartWithStopwardsTestCase extends AbstractAirsonicHom
criteria.setQuery("will");
SearchResult result = searchService.search(criteria, folders, IndexType.ARTIST_ID3);
// XXX 3.x -> 8.x : The filter is properly applied to the input(Stopward)
Assert.assertEquals("Williams hit by \"will\" ", 0, result.getTotalHits());
// Will hit because Airsonic's stopword is defined(#1235)
Assert.assertEquals("Williams hit by \"will\" ", 1, result.getTotalHits());
criteria.setQuery("the");
result = searchService.search(criteria, folders, IndexType.SONG);

Loading…
Cancel
Save