Extract hive table partition in Spark - java


Problem description

Is there any way in Spark to extract only partition column names? The workaround I am using is to run "show extended table like table_name" using HiveContext.
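
For reference, that workaround looks roughly like the sketch below. This assumes a Spark 1.x HiveContext named hiveContext and an existing Hive table; the statement is SHOW TABLE EXTENDED LIKE 'table_name', and since its output layout differs between Spark/Hive versions, the rows are simply printed and the partition-column line is read off from the extended description:

// rough sketch of the SHOW TABLE EXTENDED workaround (output format varies by version)
for (org.apache.spark.sql.Row row :
        hiveContext.sql("SHOW TABLE EXTENDED LIKE 'table_name'").collectAsList()) {
    System.out.println(row.mkString());
}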

Solution

You can use the class HiveMetaStoreClient to query the HiveMetaStore directly.

This class is also widely used by popular tools that interact with the HiveMetaStore, for example Apache Drill.

org.apache.hadoop.hive.metastore.api.Partition getPartition(String db_name, String tbl_name, List<String> part_vals)

org.apache.hadoop.hive.metastore.api.Partition getPartition(String db, String tableName, String partName)

Map<String, List<ColumnStatisticsObj>> getPartitionColumnStatistics(String dbName, String tableName, List<String> partNames, List<String> colNames)

Get partition column statistics for the given dbName, tableName, multiple partition names and column names.

List<Partition> getPartitionsByNames(String db_name, String tbl_name, List<String> part_names)

Get partitions by a list of partition names.

There are also list methods:

List<String> listPartitionNames(String db_name, String tbl_name, List<String> part_vals, short max_parts)

List<String> listPartitionNames(String dbName, String tblName, short max)

List<Partition> listPartitions(String db_name, String tbl_name, List<String> part_vals, short max_parts)

List<Partition> listPartitions(String db_name, String tbl_name, short max_parts)
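
If only the partition column names (the partition keys) are needed, the table definition itself already carries them, so no partitions have to be listed at all. A minimal sketch, with placeholder metastore URI and database/table names, using getTable(...).getPartitionKeys():

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Table;

// minimal sketch: print only the partition column names (and types) of one table
public class PartitionColumns {
    public static void main(String[] args) throws Exception {
        HiveConf hiveConf = new HiveConf();
        hiveConf.setVar(HiveConf.ConfVars.METASTOREURIS, "thrift://host:port"); // placeholder metastore URI
        HiveMetaStoreClient client = new HiveMetaStoreClient(hiveConf);

        // the partition keys are part of the table definition itself
        Table table = client.getTable("db_name", "table_name");
        for (FieldSchema partitionKey : table.getPartitionKeys()) {
            System.out.println(partitionKey.getName() + "\t" + partitionKey.getType());
        }
        client.close();
    }
}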

Sample code snippet 1:

import org.apache.hadoop.hive.conf.HiveConf;

// test program
public class Test {
    public static void main(String[] args){

        HiveConf hiveConf = new HiveConf();
        hiveConf.setIntVar(HiveConf.ConfVars.METASTORETHRIFTCONNECTIONRETRIES, 3);
        hiveConf.setVar(HiveConf.ConfVars.METASTOREURIS, "thrift://host:port");

        HiveMetaStoreConnector hiveMetaStoreConnector = new HiveMetaStoreConnector(hiveConf);
        if(hiveMetaStoreConnector != null){
            // note: getAllPartitionInfo expects a database name and lists the partitions of all its tables
            System.out.print(hiveMetaStoreConnector.getAllPartitionInfo("databasename"));
        }
    }
}
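
Here thrift://host:port is a placeholder for the Thrift URI of your Hive metastore service; if a hive-site.xml with hive.metastore.uris is on the classpath, the explicit setVar call should not be necessary, since HiveConf picks it up by default.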


// define a class like this

import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.thrift.TException;
import org.joda.time.DateTime;

import java.util.Arrays;
import java.util.List;

public class HiveMetaStoreConnector {
    private HiveConf hiveConf;
    HiveMetaStoreClient hiveMetaStoreClient;

    public HiveMetaStoreConnector(String msAddr, String msPort){
        try {
            hiveConf = new HiveConf();
            hiveConf.setVar(HiveConf.ConfVars.METASTOREURIS, msAddr+":"+ msPort);
            hiveMetaStoreClient = new HiveMetaStoreClient(hiveConf);
        } catch (MetaException e) {
            e.printStackTrace();
            System.err.println("Constructor error");
            System.err.println(e.toString());
            System.exit(-100);
        }
    }

    public HiveMetaStoreConnector(HiveConf hiveConf){
        try {
            this.hiveConf = hiveConf;
            hiveMetaStoreClient = new HiveMetaStoreClient(hiveConf);
        } catch (MetaException e) {
            e.printStackTrace();
            System.err.println("Constructor error");
            System.err.println(e.toString());
            System.exit(-100);
        }
    }

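    // Partition information for every table in the given database,
    // joined into one newline-separated string.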
    public String getAllPartitionInfo(String dbName){
        List<String> res = Lists.newArrayList();
        try {
            List<String> tableList = hiveMetaStoreClient.getAllTables(dbName);
            for(String tableName:tableList){
                res.addAll(getTablePartitionInformation(dbName,tableName));
            }
        } catch (MetaException e) {
            e.printStackTrace();
            System.out.println("getAllTableStatistic error");
            System.out.println(e.toString());
            System.exit(-100);
        }

        return Joiner.on("\n").join(res);
    }

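    // One tab-separated line per partition of the given table:
    // table name, partition values (padded to 4 entries with "null"), creation time.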
    public List<String> getTablePartitionInformation(String dbName, String tableName){
        List<String> partitionsInfo = Lists.newArrayList();
        try {
            List<String> partitionNames = hiveMetaStoreClient.listPartitionNames(dbName,tableName, (short) 10000);
            List<Partition> partitions = hiveMetaStoreClient.listPartitions(dbName,tableName, (short) 10000);
            for(Partition partition:partitions){
                StringBuffer sb = new StringBuffer();
                sb.append(tableName);
                sb.append("\t");
                List<String> partitionValues = partition.getValues();
                if(partitionValues.size()<4){
                    int size = partitionValues.size();
                    for(int j=0; j<4-size;j++){
                        partitionValues.add("null");
                    }
                }
                sb.append(Joiner.on("\t").join(partitionValues));
                sb.append("\t");
                DateTime createDate = new DateTime((long)partition.getCreateTime()*1000);
                sb.append(createDate.toString("yyyy-MM-dd HH:mm:ss"));
                partitionsInfo.add(sb.toString());
            }

        } catch (TException e) {
            e.printStackTrace();
            return Arrays.asList(new String[]{"error for request on " + tableName});
        }

        return partitionsInfo;
    }

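    // Column information for every table in the given database,
    // joined into one newline-separated string.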
    public String getAllTableStatistic(String dbName){
        List<String> res = Lists.newArrayList();
        try {
            List<String> tableList = hiveMetaStoreClient.getAllTables(dbName);
            for(String tableName:tableList){
                res.addAll(getTableColumnsInformation(dbName,tableName));
            }
        } catch (MetaException e) {
            e.printStackTrace();
            System.out.println("getAllTableStatistic error");
            System.out.println(e.toString());
            System.exit(-100);
        }

        return Joiner.on("\n").join(res);
    }

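    // One tab-separated line per column of the given table:
    // table name, column index, column name, type and comment.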
    public List<String> getTableColumnsInformation(String dbName, String tableName){
        try {
            List<FieldSchema> fields = hiveMetaStoreClient.getFields(dbName, tableName);
            List<String> infs = Lists.newArrayList();
            int cnt = 0;
            for(FieldSchema fs : fields){
                StringBuffer sb = new StringBuffer();
                sb.append(tableName);
                sb.append("\t");
                sb.append(cnt);
                sb.append("\t");
                cnt++;
                sb.append(fs.getName());
                sb.append("\t");
                sb.append(fs.getType());
                sb.append("\t");
                sb.append(fs.getComment());
                infs.add(sb.toString());
            }

            return infs;

        } catch (TException e) {
            e.printStackTrace();
            System.out.println("getTableColumnsInformation error");
            System.out.println(e.toString());
            System.exit(-100);
            return null;
        }
    }
}

Sample code snippet 2 (source)

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.Database;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hive.hcatalog.common.HCatUtil;
import org.apache.thrift.TException;

import java.io.IOException;
import java.util.HashMap;

public class HiveMetaStoreClientTest {
    public static void main(String[] args) {

        HiveConf hiveConf = null;
        HiveMetaStoreClient hiveMetaStoreClient = null;
        String dbName = null;

        try {
            hiveConf = HCatUtil.getHiveConf(new Configuration());
            hiveMetaStoreClient = new HiveMetaStoreClient(hiveConf);

            dbName = args[0];

            getDatabase(hiveMetaStoreClient, dbName);


        } catch (MetaException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (NoSuchObjectException e) {
            e.printStackTrace();
            System.out.println("===============");
            System.out.println("database " + args[0] + "not exists");
            System.out.println("===============");
            createDatabase(hiveMetaStoreClient, dbName);
            try {
                getDatabase(hiveMetaStoreClient, dbName);
            } catch (TException e1) {
                e1.printStackTrace();
                System.out.println("TMD");
            }
        } catch (TException e) {
            e.printStackTrace();
        }
    }

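    // Prints the database's location URI, owner and parameters;
    // getDatabase throws NoSuchObjectException if the database does not exist.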
    public static Database getDatabase(HiveMetaStoreClient hiveMetaStoreClient, String dbName) throws TException {
        Database database = null;

        database = hiveMetaStoreClient.getDatabase(dbName);

        System.out.println(database.getLocationUri());
        System.out.println(database.getOwnerName());

        for (String key : database.getParameters().keySet()) {
            System.out.println(key + " = " + database.getParameters().get(key));
        }
        return database;
    }

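    // Creates a database with the given name, a placeholder description,
    // the default location and no extra parameters.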
    public static Database createDatabase(HiveMetaStoreClient hiveMetaStoreClient, String dbName) {
        HashMap<String, String> map = new HashMap<String,String>();
        Database database = new Database(dbName, "desc", null, map);
        try {
            hiveMetaStoreClient.createDatabase(database);
        } catch (TException e) {
            e.printStackTrace();
            System.out.println("some error");
        }
        return database;
    }
}
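
Snippet 2 exercises the database-level calls of the same client: HCatUtil.getHiveConf(new Configuration()) builds the HiveConf from the Hadoop configuration plus the hive-site.xml on the classpath (so no metastore URI is hard-coded), getDatabase prints a database's location, owner and parameters, and createDatabase is used as a fallback when the requested database does not exist.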
