Splitting database column into multivalued Solr field


Question

I'm going nuts trying to figure out how to get the Data Import Handler's splitBy construct to work. I was expecting it to split the input column into a multivalued field. Here's a test case to reproduce the problem:

import java.io.File;
import java.io.IOException;
import java.sql.SQLException;

import static org.junit.Assert.*;

import javax.sql.DataSource;

import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.io.FileUtils;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.core.CoreContainer;
import org.hsqldb.jdbc.JDBCDataSource;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

public class TestSplitBy {

  SolrServer server;
  File configPath = new File(FileUtils.getTempDirectory(), Long.toString(System.nanoTime()));

  String solrconfig_xml = "<config><luceneMatchVersion>LUCENE_41</luceneMatchVersion><requestHandler name=\"search\" class=\"solr.SearchHandler\" default=\"true\"><lst name=\"defaults\"><str name=\"fl\">*</str><str name=\"df\">id</str></lst></requestHandler><requestHandler name=\"/dataimport\" class=\"org.apache.solr.handler.dataimport.DataImportHandler\"><lst name=\"defaults\"><str name=\"config\">data-config.xml</str></lst></requestHandler></config>";

  String data_config_xml = "<dataConfig>" +
      "<dataSource url=\"jdbc:hsqldb:mem:testdb\" user=\"SA\" driver=\"org.hsqldb.jdbc.JDBCDriver\" />" +
      "<document>" +
      "<entity name=\"item\" transformer=\"RegexTransformer\" query=\"SELECT * FROM test\">" +
      "<field column=\"type\" name=\"type\" splitBy=\",\" />" +
      "</entity>" +
      "</document>" +
      "</dataConfig>";

  String schema_xml = "<schema version=\"1.3\" name=\"test\">" +
      "<types>" +
      "<fieldType name=\"string\" class=\"solr.StrField\" sortMissingLast=\"true\" omitNorms=\"true\" />" +
      "</types>" +
      "<fields>" +
      "<field stored=\"true\" name=\"id\" type=\"string\" />" +
      "<field stored=\"true\" name=\"type\" type=\"string\" multiValued=\"true\"/>" +
      "</fields>" +
      "<uniqueKey>id</uniqueKey>" +
      "</schema>";

  DataSource getDataSource() {
    JDBCDataSource ds = new JDBCDataSource();
    ds.setUser("SA");
    ds.setUrl("mem:testdb");
    return ds;
  }

  void populateDb(DataSource ds) {
    QueryRunner runner = new QueryRunner(ds);
    try {
      runner.update("DROP TABLE test IF EXISTS");
      runner.update("CREATE TABLE test(id INTEGER, type VARCHAR(256));");
      runner.update("INSERT INTO test VALUES 1, 'foo,bar,baz'");
    } catch (SQLException e) {
      System.err.println(e);
    }
  }

  void writeSolrConfig() throws IOException {
    File corePath = new File(configPath, "collection1");
    corePath.mkdir();
    File confPath = new File(corePath, "conf");
    confPath.mkdir();
    FileUtils.write(new File(confPath, "data-config.xml"), data_config_xml);
    FileUtils.write(new File(confPath, "schema.xml"), schema_xml);
    FileUtils.write(new File(confPath, "solrconfig.xml"), solrconfig_xml);
  }

  void startSolr() {
    System.setProperty("solr.solr.home", configPath.getAbsolutePath());
    CoreContainer.Initializer initializer = new CoreContainer.Initializer();
    CoreContainer coreContainer = initializer.initialize();
    server = new EmbeddedSolrServer(coreContainer, "collection1");
  }

  @Before
  public void setup() throws IOException {
    populateDb(getDataSource());
    writeSolrConfig();
    startSolr();
  }

  @After
  public void tearDown() {
    server.shutdown();
    FileUtils.deleteQuietly(configPath);
  }

  @Test
  public void testSplitBy() throws Exception {
    SolrQuery query = new SolrQuery();
    query.set("qt", "/dataimport");
    query.setParam("command", "full-import");
    QueryResponse response = server.query(query);
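    // the full-import command runs asynchronously, so give it a moment to finish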
    Thread.sleep(500);

    response = server.query(new SolrQuery("*:*"));
    for (SolrDocument doc: response.getResults()) {
      assertNotNull(doc.getFieldValues("type"));
      assertEquals(3, doc.getFieldValues("type").size());
    }
  }
}

And the POM for the test case:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>org.example</groupId>
  <artifactId>solr</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <name>Solr Sanity</name>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>2.3.2</version>
        <configuration>
          <source>1.6</source>
          <target>1.6</target>
        </configuration>
      </plugin>
    </plugins>
  </build>

  <dependencies>
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr</artifactId>
      <version>4.1.0</version>
      <type>war</type>
    </dependency>
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr-dataimporthandler</artifactId>
      <version>4.1.0</version>
      <type>jar</type>
    </dependency>
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr-solrj</artifactId>
      <version>4.1.0</version>
      <type>jar</type>
    </dependency>
    <dependency>
      <groupId>commons-dbutils</groupId>
      <artifactId>commons-dbutils</artifactId>
      <version>1.5</version>
      <type>jar</type>
    </dependency>
    <dependency>
      <groupId>org.hsqldb</groupId>
      <artifactId>hsqldb</artifactId>
      <version>2.2.9</version>
      <type>jar</type>
      <scope>runtime</scope>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
    </dependency>
    <dependency>
      <groupId>javax.servlet</groupId>
      <artifactId>servlet-api</artifactId>
      <version>2.5</version>
    </dependency>
  </dependencies>
</project>

Any insight on how to get those types to split correctly into multiple values?

Answer

Turns out there were a couple of issues with this unit test:

  1. HSQL's column names are case sensitive (and default to upper case).

  2. If the Solr field name and the db column name are identical, an extra token with the entire db value is also added.

The field definition should look something like:

<field column="solrField" splitBy="," sourceColName="TYPE" />

And in general, when using the RegexTransformer to mix single-valued fields from a DB with multivalued fields (a short sketch combining both cases follows the list):

  • If using splitBy, then the column attribute is the name of the Solr field, and sourceColName is the database column.

  • If not using splitBy, then the column attribute is the database column name, and the name attribute is the Solr field.
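
A minimal sketch combining both cases in one entity (reusing the test's ID and TYPE columns; this example is an assumption based on the rules above, not part of the original answer):

<entity name="item" transformer="RegexTransformer" query="SELECT * FROM test">
  <!-- no splitBy: column is the DB column, name is the Solr field -->
  <field column="ID" name="id" />
  <!-- splitBy: column is the Solr field, sourceColName is the DB column -->
  <field column="type" splitBy="," sourceColName="TYPE" />
</entity>

The RegexTransformer only acts on fields that declare splitBy or regex, so the plain id mapping is passed through unchanged.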
