elasticsearch检索word文档(elastic search文档)

版本选择,参考:https://blog.csdn.net/2301_79098963/article/details/138275506

下载elasticsearch-7-10-0,选择windows版本,zip包解压到指定目录即可
https://www.elastic.co/downloads/past-releases/elasticsearch-7-10-0

对于word、pdf等文档类型的文件而言,它们文件底层的内容除了纯文本之外,还会有很多杂乱的信息(比如在一个word文件中,除了文本内容,还包含了页面设置、字体大小、颜色等无关信息)
为了剔除文档中与文本无关的信息,所以才需要使用文本抽取插件

安装文本抽取插件:ingest-attachment

windows下命令(进到bin目录):

elasticsearch-plugin install ingest-attachment

Linux下命令(进到bin目录):

./elasticsearch-plugin install ingest-attachment

安装完成后,可以看到:

为了方便后续检索文本,需要安装一个IK分词器插件(官方下载地址:https://github.com/medcl/elasticsearch-analysis-ik )
官方里面也有说明如何进行下载。选择一个和你elasticsearch版本相同的版本进行下载即可,此时可能需要用到翻墙软件。比如执行以下命令:

windows下(进到bin目录):

elasticsearch-plugin install https://release.infinilabs.com/analysis-ik/stable/elasticsearch-analysis-ik-7.10.0.zip

elasticsearch启动:
K:\elasticsearch-7.10.0\bin>elasticsearch
回车即可

springboot集成elasticsearch
1、在yml中添加es的配置


#elasticsearch info
es:
  elasticsearch:
    url: 127.0.0.1
    port: 9200
2、maven中引入相关jar

    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
    </dependency>

3、创建客户端,目前一般创建HighLevelClient。这个根据es的版本而定。
import org.apache.http.HttpHost;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

@Configuration
public class ElasticsearchConfig {
@Value(“${es.elasticsearch.url}”)
private String esHost;

@Value("${es.elasticsearch.port}")
private int esPort;

@Bean
public RestHighLevelClient restHighLevelClient() {
    // 设置连接的用户名密码
    return new RestHighLevelClient(RestClient.builder(new HttpHost(esHost, esPort, "http")));
}

}

4、访问es
package cn.fss.elasticsearch;

import java.util.Arrays;

import org.elasticsearch.action.DocWriteResponse;
import org.elasticsearch.action.delete.DeleteRequest;
import org.elasticsearch.action.delete.DeleteResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import com.alibaba.fastjson2.JSON;

import cn.fss.common.core.page.TableDataInfo;
import lombok.extern.slf4j.Slf4j;

@Component
@Slf4j
public class ElasticSearchClient {

@Autowired

private RestHighLevelClient restHighLevelClient;

/**
 * 获得关键词搜索结果
 * @param index
 * @param sourceBuilder
 * @return
 */
public TableDataInfo selectDocumentList(String index, SearchSourceBuilder sourceBuilder) {
    try {
        SearchRequest request = new SearchRequest(index);
        if (sourceBuilder != null) {
            // 返回实际命中数
            sourceBuilder.trackTotalHits(true);
            request.source(sourceBuilder);
        }
        SearchResponse response = restHighLevelClient.search(request, RequestOptions.DEFAULT);
        if (response.getHits() != null) {
            System.out.println("total========="+response.getHits().getTotalHits().value);
            TableDataInfo data = new TableDataInfo();
            data.setRows(Arrays.asList(response.getHits().getHits()));
            data.setTotal(response.getHits().getTotalHits().value);//命中总数
            return data;
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}

/**
 * 插入/修改文档信息
 * @param index 索引
 * @param data  数据
 * @param docId
 */
public Boolean insertDocument(String index, String docId, Object data) {
    try {
        IndexRequest request = new IndexRequest(index);
        request.timeout(TimeValue.timeValueSeconds(10));//10秒
        request.id(docId);//如果先前已经上传过文件,此时再次插入会更新。
        // 重要!!必须设置管道
        request.setPipeline("attachment");//文件通道
        request.source(JSON.toJSONString(data), XContentType.JSON);
        IndexResponse response = restHighLevelClient.index(request, RequestOptions.DEFAULT);
        log.info("[es] 插入文档的响应状态: status:{},id:{}", response.status().getStatus(), response.getId());
        String status = response.status().toString();
        if ("CREATED".equals(status) || "OK".equals(status)) {
            log.debug("[es] 插入或修改文档成功! ");
            return true;
        }
    } catch (Exception e) {
        log.error("[es] 插入或修改文档失败",e);
    }
    return false;
}

/**
 * 删除某个index的docId
 * @param index
 * @param docId
 * @return
 */
public boolean deleteDocument(String index,String docId) {
    try {
        DeleteRequest request = new DeleteRequest(index,docId);
        DeleteResponse res = restHighLevelClient.delete(request, RequestOptions.DEFAULT);
        log.info("index:[{}],docId:[{}],进行删除操作,es返回的结果是:"+res.getResult());
        if(res.getResult() == DocWriteResponse.Result.DELETED) {
            return true;
        }
        
    }catch(Exception ex) {
        log.error(ex.getMessage(),ex);
    }
    return false;
}

}

如果存储的是word、pdf或txt文本格式,需要以下几个步骤
1、在es中创建文件通道,put方式,注意attachment,这个在java代码中会用到。注意:该PUT请求必须携带请求体来定义attachment处理器,例如:{"description":"Extract attachment information","processors":[{"attachment":{"field":"content"}}]}
http://localhost:9200/_ingest/pipeline/attachment
2、创建索引,put方式,docwrite就是索引名称,代码中也会用到
http://localhost:9200/docwrite
3、word、txt或pdf文件,需要转成base64
import java.util.Base64;


/**
 * Reads a file and returns its content encoded as a Base64 string — the format
 * the ES "attachment" ingest pipeline expects as input.
 *
 * @param file the file to encode (word/pdf/txt, etc.)
 * @return Base64-encoded file content, or null when the file cannot be read
 */
private String getFileBase64(File file) {
    try {
        // Files.readAllBytes replaces the original manual stream-copy loop
        // (FileInputStream + ByteArrayOutputStream + 1 KiB buffer).
        return Base64.getEncoder().encodeToString(Files.readAllBytes(file.toPath()));
    } catch (Exception e) {
        // Keep the original best-effort contract: report and return null on failure.
        e.printStackTrace();
        return null;
    }
}

4、service层代码
package cn.fss.elasticsearch.service.impl;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightField;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.util.CollectionUtils;
import cn.fss.elasticsearch.domain.SearchReq;
import cn.fss.common.core.page.TableDataInfo;
import cn.fss.common.utils.uuid.UUID;
import cn.fss.elasticsearch.ElasticSearchClient;
import cn.fss.elasticsearch.domain.DocumentObj;
import cn.fss.elasticsearch.service.ISearchSerice;
import lombok.extern.slf4j.Slf4j;

@Service
@Slf4j
public class SearchServiceImpl implements ISearchSerice {

    @Autowired
    private ElasticSearchClient esClient;

    /**
     * Full-text search over the "docwrite" index: matches the keyword against the
     * text extracted by the attachment pipeline, pages with from+size, and
     * highlights the matched fragments.
     *
     * @param req keyword plus page number/size
     * @return a page of DocumentObj with highlighted content, or null when nothing matched
     */
    @Override
    public TableDataInfo search(SearchReq req) {
        String keyword = req.getKeyword();
        // from+size paging; for deep pages consider search_after/scroll instead.
        Integer pageSize = req.getPageSize();
        Integer pageNum = req.getPageNum();
        Integer from = (pageNum - 1) * pageSize;
        // Highlight matched terms in the extracted text. Tags are intentionally
        // empty; the caller applies its own styling to the fragments.
        HighlightBuilder highlightBuilder = new HighlightBuilder().field("attachment.content")
                .preTags("").postTags("");
        SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder()
                .query(QueryBuilders.matchQuery("attachment.content", keyword).analyzer("ik_smart"))
                .from(from).size(pageSize)   // paging
                .trackTotalHits(true)        // real total, not capped at 10,000
                .highlighter(highlightBuilder);
        TableDataInfo datas = esClient.selectDocumentList("docwrite", searchSourceBuilder);
        if (datas == null || CollectionUtils.isEmpty(datas.getRows())) {
            return null;
        }
        Long total = datas.getTotal(); // total hit count

        @SuppressWarnings("unchecked")
        List<SearchHit> searchHits = (List<SearchHit>) datas.getRows();

        // Map each hit to a DocumentObj, keeping only hits that produced highlight text.
        List<DocumentObj> results = new ArrayList<>();
        for (SearchHit hit : searchHits) {
            Map<String, Object> sourceAsMap = hit.getSourceAsMap();
            DocumentObj obj = new DocumentObj();
            // Cast via Number: depending on magnitude the JSON source may hold the
            // id as Integer or Long; the original (Integer) cast could throw CCE.
            obj.setDocId(((Number) sourceAsMap.get("docId")).intValue());
            obj.setDocName((String) sourceAsMap.get("docName"));
            obj.setDocType((String) sourceAsMap.get("docType"));
            obj.setTitle((String) sourceAsMap.get("title"));

            HighlightField contentHighlightField = hit.getHighlightFields().get("attachment.content");
            // Guard: a hit may carry no highlight for this field; the original
            // dereferenced it unconditionally and would NPE here.
            if (contentHighlightField == null) {
                continue;
            }
            // Join ALL highlight fragments of this document with "|".
            // (The original comment claimed "first 2 only" but the code joins all.)
            Text[] fragments = contentHighlightField.fragments();
            StringBuilder highLightMessage = new StringBuilder();
            if (fragments != null) {
                for (Text t : fragments) {
                    highLightMessage.append(t.toString()).append("|");
                }
            }
            if (StringUtils.isNoneBlank(highLightMessage.toString())) {
                obj.setContent(highLightMessage.toString());
                results.add(obj);
            }
        }
        TableDataInfo rst = new TableDataInfo();
        rst.setRows(results);
        rst.setTotal(total);
        return rst;
    }

    /**
     * Indexes one document (insert, or update when the docId already exists).
     *
     * @param data document metadata plus base64 file content
     * @return true when ES accepted the document
     */
    @Override
    public Boolean uploadFile(DocumentObj data) {
        try {
            return esClient.insertDocument("docwrite", data.getDocId() + "", data);
        } catch (Exception ex) {
            log.error(ex.getMessage(), ex);
        }
        return false;
    }

    /**
     * Removes one document from the given index.
     * (Method name typo "Documnet" kept — it is part of the ISearchSerice interface.)
     */
    @Override
    public boolean deleteDocumnet(String index, String docId) {
        return esClient.deleteDocument(index, docId);
    }
}

大数据

如何调试linux内核(如何调试linux内核源码)

2025-3-3 10:15:42

大数据

kafka单机版搭建(单机版kafka部署)

2025-3-3 10:15:45

0 条回复 A文章作者 M管理员
    暂无讨论,说说你的看法吧