airsonic-custom/airsonic-main/src/main/java/org/airsonic/player/service/search/AnalyzerFactory.java

/*
 This file is part of Airsonic.

 Airsonic is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 Airsonic is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with Airsonic.  If not, see <http://www.gnu.org/licenses/>.

 Copyright 2016 (C) Airsonic Authors
 Based upon Subsonic, Copyright 2009 (C) Sindre Mehus
 */

package org.airsonic.player.service.search;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cjk.CJKWidthFilterFactory;
import org.apache.lucene.analysis.core.KeywordTokenizerFactory;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer.Builder;
import org.apache.lucene.analysis.en.EnglishPossessiveFilterFactory;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.pattern.PatternReplaceFilterFactory;
import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import static org.springframework.util.ObjectUtils.isEmpty;

/**
 * Analyzer provider.
 * This class is a division of what was once part of SearchService and added functionality.
 * This class provides Analyzer which is used at index generation
 * and QueryAnalyzer which analyzes the specified query at search time.
 * Analyzer can be closed but is a reuse premise.
 * It is held in this class.
 */
@Component
public final class AnalyzerFactory {

    private static final String STOP_WORDS = "org/airsonic/player/service/search/analysis/stopwords.txt";

    private static final String STOP_WORDS_ARTIST = "org/airsonic/player/service/search/analysis/stopwords_artist.txt";

    private Analyzer analyzer;

    private Analyzer queryAnalyzer;

    /*
     * XXX 3.x -> 8.x : Convert UAX#29 Underscore Analysis to Legacy Analysis
     *
     * Because changes in underscores before and after words
     * have a major effect on user's forward match search.
     *
     * @see AnalyzerFactoryTestCase
     */
    private void addTokenFilterForUnderscoreRemovalAroundToken(Builder builder) throws IOException {
        builder
            .addTokenFilter(PatternReplaceFilterFactory.class,
                    "pattern", "^\\_", "replacement", "", "replace", "all")
            .addTokenFilter(PatternReplaceFilterFactory.class,
                    "pattern", "\\_$", "replacement", "", "replace", "all");
    }

    /*
     * XXX 3.x -> 8.x : Handle brackets correctly
     *
     * Process the input value of Genre search for search of domain value.
     *
     * The tag parser performs special character conversion
     * when converting input values from a file.
     * Therefore, the domain value may be different from the original value.
     * This filter allows searching by user readable value (file tag value).
     *
     * @see org.jaudiotagger.tag.id3.framebody.FrameBodyTCON#convertID3v23GenreToGeneric
     * (TCON stands for Genre with ID3 v2.3-v2.4)
     * Such processing exists because brackets in the Gener string have a special meaning.
     */
    private void addTokenFilterForTokenToDomainValue(Builder builder) throws IOException {
        builder
            .addTokenFilter(PatternReplaceFilterFactory.class,
                    "pattern", "\\(", "replacement", "", "replace", "all")
            .addTokenFilter(PatternReplaceFilterFactory.class,
                    "pattern", "\\)$", "replacement", "", "replace", "all")
            .addTokenFilter(PatternReplaceFilterFactory.class,
                    "pattern", "\\)", "replacement", " ", "replace", "all")
            .addTokenFilter(PatternReplaceFilterFactory.class,
                    "pattern", "\\{\\}", "replacement", "\\{ \\}", "replace", "all")
            .addTokenFilter(PatternReplaceFilterFactory.class,
                    "pattern", "\\[\\]", "replacement", "\\[ \\]", "replace", "all");
    }

    private Builder createDefaultAnalyzerBuilder() throws IOException {
        Builder builder = CustomAnalyzer.builder()
                .withTokenizer(StandardTokenizerFactory.class)
                .addTokenFilter(CJKWidthFilterFactory.class)
                .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "false")
                .addTokenFilter(LowerCaseFilterFactory.class)
                .addTokenFilter(StopFilterFactory.class, "words", STOP_WORDS)
                .addTokenFilter(EnglishPossessiveFilterFactory.class);
        addTokenFilterForUnderscoreRemovalAroundToken(builder);
        return builder;
    }

    private Builder createArtistAnalyzerBuilder() throws IOException {
        Builder builder = CustomAnalyzer.builder()
                .withTokenizer(StandardTokenizerFactory.class)
                .addTokenFilter(CJKWidthFilterFactory.class)
                .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "false")
                .addTokenFilter(LowerCaseFilterFactory.class)
                .addTokenFilter(StopFilterFactory.class, "words", STOP_WORDS_ARTIST)
                .addTokenFilter(EnglishPossessiveFilterFactory.class);
        addTokenFilterForUnderscoreRemovalAroundToken(builder);
        return builder;
    }

    private Builder createKeywordAnalyzerBuilder() throws IOException {
        return CustomAnalyzer.builder()
                .withTokenizer(KeywordTokenizerFactory.class);
    }

    private Builder createGenreAnalyzerBuilder() throws IOException {
        Builder builder = createKeywordAnalyzerBuilder();
        addTokenFilterForTokenToDomainValue(builder);
        return builder;
    }

    /**
     * Returns the Analyzer to use when generating the index.
     *
     * Whether this analyzer is applied to input values depends on
     * the definition of the document's fields.
     *
     * @return analyzer for index
     * @see DocumentFactory
     */
    public Analyzer getAnalyzer() throws IOException {
        if (isEmpty(analyzer)) {
            try {

                Analyzer defaultAnalyzer = createDefaultAnalyzerBuilder().build();
                Analyzer artistAnalyzer = createArtistAnalyzerBuilder().build();

                Map<String, Analyzer> fieldAnalyzers = new HashMap<>();
                fieldAnalyzers.put(FieldNames.ARTIST, artistAnalyzer);

                analyzer = new PerFieldAnalyzerWrapper(defaultAnalyzer, fieldAnalyzers);

            } catch (IOException e) {
                throw new IOException("Error when initializing Analyzer.", e);
            }
        }
        return analyzer;
    }

    /**
     * Returns the analyzer to use when generating a query for index search.
     *
     * String processing handled by QueryFactory
     * is limited to Lucene's modifier.
     *
     * The processing of the operands is expressed
     * in the AnalyzerFactory implementation.
     * Rules for tokenizing/converting input values
     * should not be described in QueryFactory.
     *
     * @return analyzer for query
     * @see QueryFactory
     */
    public Analyzer getQueryAnalyzer() throws IOException {
        if (isEmpty(queryAnalyzer)) {
            try {

                Analyzer defaultAnalyzer = createDefaultAnalyzerBuilder().build();
                Analyzer artistAnalyzer = createArtistAnalyzerBuilder().build();
                Analyzer genreAnalyzer = createGenreAnalyzerBuilder().build();

                Map<String, Analyzer> fieldAnalyzers = new HashMap<>();
                fieldAnalyzers.put(FieldNames.ARTIST, artistAnalyzer);
                fieldAnalyzers.put(FieldNames.GENRE, genreAnalyzer);

                queryAnalyzer = new PerFieldAnalyzerWrapper(defaultAnalyzer, fieldAnalyzers);

            } catch (IOException e) {
                throw new IOException("Error when initializing Analyzer.", e);
            }
        }
        return queryAnalyzer;
    }

}