如何提高 Databricks 性能? [英] How to increase Databricks performance?
本文介绍了"如何提高 Databricks 性能?"的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!
问题描述
这是我的脚本,它是按多维数据集分组的:
# Weekly sales cube over the department fact dataframe.
#
# Builds every grouping-set combination (CUBE) of the seven listed dimensions
# and computes sales / basket / member / promotion metrics for each one, then
# fixes the column order and sorts the output.
#
# NOTE(review): the original multi-line chain was a Python SyntaxError — a
# newline before ".agg(" / ".select(" / ".orderBy(" without enclosing
# parentheses.  The whole expression is now wrapped in a single pair of
# parentheses so line continuation is implicit.

# Column expressions and predicates reused many times in the aggregation.
_qty = cube_department_df["SalesQty"]
_amt = cube_department_df["SalesAmt"]
_is_member = cube_department_df["membership_id"].isNotNull()
_is_non_member = cube_department_df["membership_id"].isNull()
# NOTE(review): "== 'Y'" with an otherwise-branch and "== 'N'" are NOT
# complements when promotion_flag is null or has another value; both forms
# from the original are preserved exactly.
_is_promo = cube_department_df["promotion_flag"] == 'Y'
_is_non_promo = cube_department_df["promotion_flag"] == 'N'

# Aggregates reused by the ratio metrics below.
_total_qty = F.sum(_qty)
_total_amt = F.sum(_amt)
_basket_cnt = F.countDistinct(cube_department_df["internal_tranx_key"])
_member_cnt = F.countDistinct(cube_department_df["membership_id"])


def _sum_when(cond, value, name):
    # Sum `value` over rows where `cond` holds, 0 elsewhere.
    return F.sum(F.when(cond, value).otherwise(0)).alias(name)


def _sum_unless(cond, value, name):
    # Sum `value` over rows where `cond` does NOT hold (original wrote this as
    # when(cond, 0).otherwise(value); kept identically).
    return F.sum(F.when(cond, 0).otherwise(value)).alias(name)


cube_department_read = (
    cube_department_df.cube(
        cube_department_df["YEAR"], cube_department_df["WeekOfYear"],
        cube_department_df["Month"], cube_department_df["department_groups"],
        cube_department_df["category_name"], cube_department_df["subcategory_name"],
        cube_department_df["section_name"])
    .agg(
        F.max('last_date_of_week').alias('last_date_of_week'),
        F.countDistinct('internal_tranx_key').alias('sales_basket'),
        F.sum('SalesAmt').alias('sales_amt'),
        F.sum('SalesQty').alias('sales_qty'),
        F.sum('SalesQtyPro').alias('SalesQtyPro'),
        F.sum('SalesAmtPro').alias('SalesAmtPro'),
        F.countDistinct('membership_id').alias('member_count'),
        # Member vs non-member splits.
        _sum_when(_is_member, _qty, "SalesQty_Member"),
        _sum_when(_is_member, _amt, "SalesAmt_Member"),
        _sum_when(_is_member, 1, "Basket_Count_Member"),
        _sum_unless(_is_member, _qty, "SalesQty_NonMember"),
        _sum_unless(_is_member, _amt, "SalesAmt_NonMember"),
        _sum_unless(_is_member, 1, "Basket_Count_NonMember"),
        # Promotion vs non-promotion splits.
        _sum_when(_is_promo, _amt, "SalesAmt_MMDS_Promotion"),
        _sum_when(_is_promo, _qty, "SalesQty_MMDS_Promotion"),
        _sum_when(_is_promo, 1, "Basket_Count_MMDS_Promotion"),
        _sum_unless(_is_promo, _amt, "SalesAmt_Non_MMDS_Promotion"),
        _sum_unless(_is_promo, _qty, "SalesQty_Non_MMDS_Promotion"),
        _sum_unless(_is_promo, 1, "Basket_Count_Non_MMDS_Promotion"),
        # Promotion x membership cross splits.
        _sum_when(_is_promo & _is_member, _amt, "SalesAmt_Member_MMDS_Promotion"),
        _sum_when(_is_promo & _is_member, _qty, "SalesQty_Member_MMDS_Promotion"),
        _sum_when(_is_promo & _is_member, 1, "Basket_Count_Member_MMDS_Promotion"),
        _sum_when(_is_promo & _is_non_member, _amt, "SalesAmt_Non_Member_MMDS_Promotion"),
        _sum_when(_is_promo & _is_non_member, _qty, "SalesQty_Non_Member_MMDS_Promotion"),
        _sum_when(_is_promo & _is_non_member, 1, "Basket_Count_Non_Member_MMDS_Promotion"),
        _sum_when(_is_non_promo & _is_member, _amt, "SalesAmt_Member_Non_MMDS_Promotion"),
        _sum_when(_is_non_promo & _is_member, _qty, "SalesQty_Member_Non_MMDS_Promotion"),
        _sum_when(_is_non_promo & _is_member, 1, "Basket_Count_Member_Non_MMDS_Promotion"),
        _sum_when(_is_non_promo & _is_non_member, _amt, "SalesAmt_Non_Member_Non_MMDS_Promotion"),
        _sum_when(_is_non_promo & _is_non_member, _qty, "SalesQty_Non_Member_Non_MMDS_Promotion"),
        _sum_when(_is_non_promo & _is_non_member, 1, "Basket_Count_Non_Member_Non_MMDS_Promotion"),
        # Ratio metrics: sign-flip when both aggregates are negative, 0 when
        # either is 0, plain ratio otherwise (division-by-zero guard).
        F.when((_total_qty < 0) & (_total_amt < 0), (_total_amt / _total_qty) * -1)
         .when((_total_qty == 0) | (_total_amt == 0), 0)
         .otherwise(_total_amt / _total_qty).alias("sales_per_unit"),
        F.when((_total_qty < 0) & (_total_amt < 0), (_total_amt / _basket_cnt) * -1)
         .when((_total_qty == 0) | (_total_amt == 0), 0)
         .otherwise(_total_amt / _basket_cnt).alias("sales_per_basket"),
        F.when((_total_qty < 0) & (_total_amt < 0), (_total_qty / _basket_cnt) * -1)
         .when((_total_qty == 0) | (_total_amt == 0), 0)
         .otherwise(_total_qty / _basket_cnt).alias("unit_per_basket"),
        # NOTE(review): countDistinct can never be < 0, so the first branch is
        # dead code; kept byte-for-byte to preserve the original behavior.
        F.when((_member_cnt < 0) & (_total_amt < 0), (_total_amt / _member_cnt) * -1)
         .when((_member_cnt == 0) | (_total_amt == 0), 0)
         .otherwise(_total_amt / _member_cnt).alias("spend_per_customer"))
    .select(
        "YEAR", "WeekOfYear", "Month", "department_groups", "category_name",
        "subcategory_name", "section_name",
        "last_date_of_week", "sales_basket", "sales_amt", "sales_qty",
        "SalesQtyPro", "SalesAmtPro",
        "member_count", "SalesQty_Member", "SalesAmt_Member", "Basket_Count_Member",
        "SalesQty_NonMember", "SalesAmt_NonMember", "Basket_Count_NonMember",
        "SalesAmt_MMDS_Promotion", "SalesQty_MMDS_Promotion", "Basket_Count_MMDS_Promotion",
        "SalesAmt_Non_MMDS_Promotion", "SalesQty_Non_MMDS_Promotion", "Basket_Count_Non_MMDS_Promotion",
        "SalesAmt_Member_MMDS_Promotion", "SalesQty_Member_MMDS_Promotion", "Basket_Count_Member_MMDS_Promotion",
        "SalesAmt_Non_Member_MMDS_Promotion", "SalesQty_Non_Member_MMDS_Promotion", "Basket_Count_Non_Member_MMDS_Promotion",
        "SalesAmt_Member_Non_MMDS_Promotion", "SalesQty_Member_Non_MMDS_Promotion", "Basket_Count_Member_Non_MMDS_Promotion",
        "SalesAmt_Non_Member_Non_MMDS_Promotion", "SalesQty_Non_Member_Non_MMDS_Promotion", "Basket_Count_Non_Member_Non_MMDS_Promotion",
        "sales_per_unit", "sales_per_basket", "unit_per_basket", "spend_per_customer")
    .orderBy(
        F.col("YEAR").asc(),
        F.col("WeekOfYear").asc(),
        F.col("Month").asc(),
        F.col("department_groups").asc(),
        F.col("category_name").asc(),
        F.col("subcategory_name").asc(),
        F.col("section_name").asc())
)
这是我的垃圾收集(GC)情况:
那么,从这里我能做些什么?我的事实表有 1.51 亿行数据。很抱歉,我是 Databricks 新手;由于 Synapse 尚不支持多维数据集(cube)操作,我需要在 Databricks 上执行这个多维数据集脚本。
推荐答案
从 Databricks 写入 Synapse 的最慢部分,是 Databricks 将数据写入临时目录(Azure Blob 存储)的步骤。
Databricks -> Blob 存储这一步不经过 PolyBase;只有 Blob 存储 -> Synapse/Azure DW 这一步走 PolyBase,而且该部分的数据传输通常很快。
您可以尝试更改写入语义:Databricks documentation
使用复制写入语义,您将能够更快地在Synapse中加载数据。
您可以在运行WRITE命令之前进行配置,方法如下:
# Switch the Databricks-Synapse connector to COPY write semantics (instead of
# the default PolyBase path); must be set on the Spark session BEFORE the
# write is executed.
spark.conf.set("spark.databricks.sqldw.writeSemantics", "copy")
这篇关于"如何提高 Databricks 性能?"的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!
查看全文