版本选择,参考:https://blog.csdn.net/2301_79098963/article/details/138275506
下载elasticsearch-7-10-0,选择windows版本,zip包解压到指定目录即可
https://www.elastic.co/downloads/past-releases/elasticsearch-7-10-0
对于word、pdf等文档类型的文件而言,它们文件底层的内容除了纯文本之外,还会有很多杂乱的信息(比如在一个word文件中,除了文本内容,还包含了页面设置、字体大小、颜色等无关信息)
为了剔除文档中与文本无关的信息,所以才需要使用文本抽取插件
安装文本抽取插件:ingest-attachment
windows下命令(进到bin目录):
elasticsearch-plugin install ingest-attachment
Linux下命令(进到bin目录):
./elasticsearch-plugin install ingest-attachment
安装完成后,可以看到:
为了方便后续检索文本,需要安装一个IK分词器插件(官方下载地址:https://github.com/medcl/elasticsearch-analysis-ik)
官方里面也有说明如何进行下载。选择一个和你elasticsearch版本相同的版本进行下载即可,此时可能需要用到翻墙软件。比如执行以下命令:
windows下(进到bin目录):
elasticsearch-plugin install https://release.infinilabs.com/analysis-ik/stable/elasticsearch-analysis-ik-7.10.0.zip
elasticsearch启动:
K:\elasticsearch-7.10.0\bin>elasticsearch
回车即可
springboot集成elasticsearch
1、在yml中添加es的配置
…
…
#elasticsearch info
es:
elasticsearch:
url: 127.0.0.1
port: 9200
2、maven中引入相关jar
org.springframework.boot
spring-boot-starter-data-elasticsearch
3、创建客户端,目前一般创建HighLevelClient(即RestHighLevelClient)。这个根据es的版本而定。
import org.apache.http.HttpHost;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
@Configuration
public class ElasticsearchConfig {
@Value(“${es.elasticsearch.url}”)
private String esHost;
@Value("${es.elasticsearch.port}")
private int esPort;
@Bean
public RestHighLevelClient restHighLevelClient() {
// 设置连接的用户名密码
return new RestHighLevelClient(RestClient.builder(new HttpHost(esHost, esPort, "http")));
}
}
4、访问es
package cn.fss.elasticsearch;
import java.util.Arrays;
import org.elasticsearch.action.DocWriteResponse;
import org.elasticsearch.action.delete.DeleteRequest;
import org.elasticsearch.action.delete.DeleteResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import com.alibaba.fastjson2.JSON;
import cn.fss.common.core.page.TableDataInfo;
import lombok.extern.slf4j.Slf4j;
@Component
@Slf4j
public class ElasticSearchClient {
@Autowired
private RestHighLevelClient restHighLevelClient;
/**
* 获得关键词搜索结果
* @param index
* @param sourceBuilder
* @return
*/
public TableDataInfo selectDocumentList(String index, SearchSourceBuilder sourceBuilder) {
try {
SearchRequest request = new SearchRequest(index);
if (sourceBuilder != null) {
// 返回实际命中数
sourceBuilder.trackTotalHits(true);
request.source(sourceBuilder);
}
SearchResponse response = restHighLevelClient.search(request, RequestOptions.DEFAULT);
if (response.getHits() != null) {
System.out.println("total========="+response.getHits().getTotalHits().value);
TableDataInfo data = new TableDataInfo();
data.setRows(Arrays.asList(response.getHits().getHits()));
data.setTotal(response.getHits().getTotalHits().value);//命中总数
return data;
}
} catch (Exception e) {
e.printStackTrace();
}
return null;
}
/**
* 插入/修改文档信息
* @param index 索引
* @param data 数据
* @param docId
*/
public Boolean insertDocument(String index, String docId, Object data) {
try {
IndexRequest request = new IndexRequest(index);
request.timeout(TimeValue.timeValueSeconds(10));//10秒
request.id(docId);//如果先前已经上传过文件,此时再次插入会更新。
// 重要!!必须设置管道
request.setPipeline("attachment");//文件通道
request.source(JSON.toJSONString(data), XContentType.JSON);
IndexResponse response = restHighLevelClient.index(request, RequestOptions.DEFAULT);
log.info("[es] 插入文档的响应状态: status:{},id:{}", response.status().getStatus(), response.getId());
String status = response.status().toString();
if ("CREATED".equals(status) || "OK".equals(status)) {
log.debug("[es] 插入或修改文档成功! ");
return true;
}
} catch (Exception e) {
log.error("[es] 插入或修改文档失败",e);
}
return false;
}
/**
* 删除某个index的docId
* @param index
* @param docId
* @return
*/
public boolean deleteDocument(String index,String docId) {
try {
DeleteRequest request = new DeleteRequest(index,docId);
DeleteResponse res = restHighLevelClient.delete(request, RequestOptions.DEFAULT);
log.info("index:[{}],docId:[{}],进行删除操作,es返回的结果是:"+res.getResult());
if(res.getResult() == DocWriteResponse.Result.DELETED) {
return true;
}
}catch(Exception ex) {
log.error(ex.getMessage(),ex);
}
return false;
}
}
如果存储的是word、pdf或txt文本格式,需要以下几个步骤
1、在es中创建文件通道,put方式,注意attachment,这个在java代码中会用到
http://localhost:9200/_ingest/pipeline/attachment
2、创建索引,put方式,docwrite就是索引名称,代码中也会用到
http://localhost:9200/docwrite
3、word、txt或pdf文件,需要转成base64
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.Base64;
…
…
/**
 * Base64-encodes the full contents of a file, as required by the
 * ingest-attachment pipeline's attachment "data" field.
 *
 * @param file word/pdf/txt file to encode
 * @return the base64 string, or null if the file could not be read
 */
private String getFileBase64(File file) {
    try {
        // Files.readAllBytes replaces the manual 1 KB stream-copy loop;
        // catch narrowed from Exception to the IOException that can occur.
        return Base64.getEncoder().encodeToString(Files.readAllBytes(file.toPath()));
    } catch (IOException e) {
        // NOTE(review): swallow-and-return-null kept for caller compatibility;
        // prefer a logger over printStackTrace in production code.
        e.printStackTrace();
        return null;
    }
}
4、service层代码
package cn.fss.elasticsearch.service.impl;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightField;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.util.CollectionUtils;
import cn.fss.elasticsearch.domain.SearchReq;
import cn.fss.common.core.page.TableDataInfo;
import cn.fss.common.utils.uuid.UUID;
import cn.fss.elasticsearch.ElasticSearchClient;
import cn.fss.elasticsearch.domain.DocumentObj;
import cn.fss.elasticsearch.service.ISearchSerice;
import lombok.extern.slf4j.Slf4j;
@Service
@Slf4j
public class SearchServiceImpl implements ISearchSerice {

    /** Index name shared by search, upload and delete (must match the index created in ES). */
    private static final String INDEX_NAME = "docwrite";

    @Autowired
    private ElasticSearchClient esClient;

    /**
     * Full-text keyword search over the extracted attachment content, with
     * from+size paging and highlighted fragments.
     *
     * @param req keyword plus pageNum/pageSize paging parameters
     * @return a page of DocumentObj whose content holds the highlighted
     *         fragments joined by '|', or null when nothing matched
     */
    @Override
    public TableDataInfo search(SearchReq req) {
        String keyword = req.getKeyword();
        // from+size paging; for deep paging consider search_after instead.
        Integer pageSize = req.getPageSize();
        Integer pageNum = req.getPageNum();
        Integer from = (pageNum - 1) * pageSize;
        // Highlight matched keywords inside the extracted attachment text.
        HighlightBuilder highlightBuilder = new HighlightBuilder().field("attachment.content")
                .preTags("").postTags("");
        SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder()
                .query(QueryBuilders.matchQuery("attachment.content", keyword).analyzer("ik_smart"))
                .from(from).size(pageSize)   // paging
                .trackTotalHits(true)        // report the real total hit count
                .highlighter(highlightBuilder);
        TableDataInfo datas = esClient.selectDocumentList(INDEX_NAME, searchSourceBuilder);
        if (datas == null || CollectionUtils.isEmpty(datas.getRows())) {
            return null;
        }
        Long total = datas.getTotal(); // total hit count
        @SuppressWarnings("unchecked")
        List<SearchHit> searchHits = (List<SearchHit>) datas.getRows();
        // Convert each hit into a DocumentObj carrying its highlighted text.
        List<DocumentObj> results = new ArrayList<>();
        for (SearchHit hit : searchHits) {
            Map<String, Object> sourceAsMap = hit.getSourceAsMap();
            DocumentObj obj = new DocumentObj();
            // Number cast tolerates ES deserializing numeric fields as Integer or Long.
            obj.setDocId(((Number) sourceAsMap.get("docId")).intValue());
            obj.setDocName((String) sourceAsMap.get("docName"));
            obj.setDocType((String) sourceAsMap.get("docType"));
            obj.setTitle((String) sourceAsMap.get("title"));
            HighlightField contentHighlightField = hit.getHighlightFields().get("attachment.content");
            // BUG FIX: the original dereferenced this without a null check and
            // would NPE on a hit with no highlight for this field.
            if (contentHighlightField == null) {
                continue; // no highlight => same outcome as the original's blank-content path
            }
            // Join every highlighted fragment with '|'.
            Text[] fragments = contentHighlightField.fragments();
            StringBuilder highLightMessage = new StringBuilder();
            if (fragments != null) {
                for (Text t : fragments) {
                    highLightMessage.append(t.toString()).append("|");
                }
            }
            if (StringUtils.isNoneBlank(highLightMessage.toString())) {
                obj.setContent(highLightMessage.toString());
                results.add(obj);
            }
        }
        TableDataInfo rst = new TableDataInfo();
        rst.setRows(results);
        rst.setTotal(total);
        return rst;
    }

    /**
     * Uploads (indexes) one document; re-uploading the same docId updates it.
     *
     * @param data document payload; its docId is used as the ES document id
     * @return true on CREATED/OK, false on failure
     */
    @Override
    public Boolean uploadFile(DocumentObj data) {
        try {
            return esClient.insertDocument(INDEX_NAME, data.getDocId() + "", data);
        } catch (Exception ex) {
            log.error(ex.getMessage(), ex);
        }
        return false;
    }

    /**
     * Deletes one document from the given index.
     * (Method name spelling comes from the ISearchSerice interface and cannot
     * be changed here without breaking the contract.)
     */
    @Override
    public boolean deleteDocumnet(String index, String docId) {
        return esClient.deleteDocument(index, docId);
    }
}