任何加快此excel导入速度的方法吗? [英] Any way to speed up this excel import?
问题描述
我有一个Excel文档,该文档大约有250000行,需要永久导入.我已经对该导入进行了许多变体,但是有一些要求: -需要验证每个单元格中的数据 -必须检查数据库中是否存在重复项 -如果存在重复项,请更新条目 -如果不存在任何条目,请插入一个新条目
我已尽可能多地使用并行化,但是我确信必须有某种方法才能使此导入运行得更快.任何帮助或想法将不胜感激.
请注意,数据库位于LAN上,是的,我知道我还没有使用参数化的sql命令(还).
public string BulkUserInsertAndUpdate()
{
DateTime startTime = DateTime.Now;
try
{
ProcessInParallel();
Debug.WriteLine("Time taken: " + (DateTime.Now - startTime));
}
catch (Exception ex)
{
return ex.Message;
}
return "";
}
private IEnumerable<Row> ReadDocument()
{
using (SpreadsheetDocument spreadSheetDocument = SpreadsheetDocument.Open(_fileName, false))
{
WorkbookPart workbookPart = spreadSheetDocument.WorkbookPart;
Sheet ss = workbookPart.Workbook.Descendants<Sheet>().SingleOrDefault(s => s.Name == "User");
if (ss == null)
throw new Exception("There was a problem trying to import the file. Please insure that the Sheet's name is: User");
WorksheetPart worksheetPart = (WorksheetPart)workbookPart.GetPartById(ss.Id);
OpenXmlReader reader = OpenXmlReader.Create(worksheetPart);
StringTablePart = workbookPart.SharedStringTablePart;
while (reader.Read())
{
if (reader.ElementType == typeof(Row))
{
do
{
if (reader.HasAttributes)
{
var rowNum = int.Parse(reader.Attributes.First(a => a.LocalName == "r").Value);
if (rowNum == 1)
continue;
var row = (Row)reader.LoadCurrentElement();
yield return row;
}
} while (reader.ReadNextSibling()); // Skip to the next row
break; // We just looped through all the rows so no need to continue reading the worksheet
}
}
}
}
private void ProcessInParallel()
{
// Use ConcurrentQueue to enable safe enqueueing from multiple threads.
var exceptions = new ConcurrentQueue<Exception>();
Parallel.ForEach(ReadDocument(), (row, loopState) =>
{
List<Cell> cells = row.Descendants<Cell>().ToList();
if (string.IsNullOrEmpty(GetCellValue(cells[0], StringTablePart)))
return;
// validation code goes here....
try
{
using (SqlConnection connection = new SqlConnection("user id=sa;password=D3vAdm!n@;server=196.30.181.143;database=TheUnlimitedUSSD;MultipleActiveResultSets=True"))
{
connection.Open();
SqlCommand command = new SqlCommand("SELECT count(*) FROM dbo.[User] WHERE MobileNumber = '" + mobileNumber + "'", connection);
var userCount = (int) command.ExecuteScalar();
if (userCount > 0)
{
// update
command = new SqlCommand("UPDATE [user] SET NewMenu = " + (newMenuIndicator ? "1" : "0") + ", PolicyNumber = '" + policyNumber + "', Status = '" + status + "' WHERE MobileNumber = '" + mobileNumber + "'", connection);
command.ExecuteScalar();
Debug.WriteLine("Update cmd");
}
else
{
// insert
command = new SqlCommand("INSERT INTO dbo.[User] ( MobileNumber , Status , PolicyNumber , NewMenu ) VALUES ( '" + mobileNumber + "' , '" + status + "' , '" + policyNumber + "' , " + (newMenuIndicator ? "1" : "0") + " )", connection);
command.ExecuteScalar();
Debug.WriteLine("Insert cmd");
}
}
}
catch (Exception ex)
{
exceptions.Enqueue(ex);
Debug.WriteLine(ex.Message);
loopState.Break();
}
});
// Throw the exceptions here after the loop completes.
if (exceptions.Count > 0)
throw new AggregateException(exceptions);
}
我建议您进行批量导入,而无需对中间表进行任何验证,然后再通过SQL进行所有验证.现在,电子表格的数据将采用类似SQL表格的结构. 这就是我从Excel和CSV导入300万行以上的工业强度所做的工作,取得了巨大的成功.
I have an Excel document that has about 250000 rows which takes forever to import. I have done many variations of this import, however there are a few requirements: - Need to validate the data in each cell - Must check if a duplicate exists in the database - If a duplicate exists, update the entry - If no entry exists, insert a new one
I have used parallelization as much as possible however I am sure that there must be some way to get this import to run much faster. Any assistance or ideas would be greatly appreciated.
Note that the database is on a LAN, and yes I know I haven't used parameterized sql commands (yet).
public string BulkUserInsertAndUpdate()
{
DateTime startTime = DateTime.Now;
try
{
ProcessInParallel();
Debug.WriteLine("Time taken: " + (DateTime.Now - startTime));
}
catch (Exception ex)
{
return ex.Message;
}
return "";
}
private IEnumerable<Row> ReadDocument()
{
using (SpreadsheetDocument spreadSheetDocument = SpreadsheetDocument.Open(_fileName, false))
{
WorkbookPart workbookPart = spreadSheetDocument.WorkbookPart;
Sheet ss = workbookPart.Workbook.Descendants<Sheet>().SingleOrDefault(s => s.Name == "User");
if (ss == null)
throw new Exception("There was a problem trying to import the file. Please insure that the Sheet's name is: User");
WorksheetPart worksheetPart = (WorksheetPart)workbookPart.GetPartById(ss.Id);
OpenXmlReader reader = OpenXmlReader.Create(worksheetPart);
StringTablePart = workbookPart.SharedStringTablePart;
while (reader.Read())
{
if (reader.ElementType == typeof(Row))
{
do
{
if (reader.HasAttributes)
{
var rowNum = int.Parse(reader.Attributes.First(a => a.LocalName == "r").Value);
if (rowNum == 1)
continue;
var row = (Row)reader.LoadCurrentElement();
yield return row;
}
} while (reader.ReadNextSibling()); // Skip to the next row
break; // We just looped through all the rows so no need to continue reading the worksheet
}
}
}
}
private void ProcessInParallel()
{
// Use ConcurrentQueue to enable safe enqueueing from multiple threads.
var exceptions = new ConcurrentQueue<Exception>();
Parallel.ForEach(ReadDocument(), (row, loopState) =>
{
List<Cell> cells = row.Descendants<Cell>().ToList();
if (string.IsNullOrEmpty(GetCellValue(cells[0], StringTablePart)))
return;
// validation code goes here....
try
{
using (SqlConnection connection = new SqlConnection("user id=sa;password=D3vAdm!n@;server=196.30.181.143;database=TheUnlimitedUSSD;MultipleActiveResultSets=True"))
{
connection.Open();
SqlCommand command = new SqlCommand("SELECT count(*) FROM dbo.[User] WHERE MobileNumber = '" + mobileNumber + "'", connection);
var userCount = (int) command.ExecuteScalar();
if (userCount > 0)
{
// update
command = new SqlCommand("UPDATE [user] SET NewMenu = " + (newMenuIndicator ? "1" : "0") + ", PolicyNumber = '" + policyNumber + "', Status = '" + status + "' WHERE MobileNumber = '" + mobileNumber + "'", connection);
command.ExecuteScalar();
Debug.WriteLine("Update cmd");
}
else
{
// insert
command = new SqlCommand("INSERT INTO dbo.[User] ( MobileNumber , Status , PolicyNumber , NewMenu ) VALUES ( '" + mobileNumber + "' , '" + status + "' , '" + policyNumber + "' , " + (newMenuIndicator ? "1" : "0") + " )", connection);
command.ExecuteScalar();
Debug.WriteLine("Insert cmd");
}
}
}
catch (Exception ex)
{
exceptions.Enqueue(ex);
Debug.WriteLine(ex.Message);
loopState.Break();
}
});
// Throw the exceptions here after the loop completes.
if (exceptions.Count > 0)
throw new AggregateException(exceptions);
}
I would have suggested that you do a bulk import WITHOUT any validation to an intermediary table, and only then do all the validation via SQL. Your spreadsheet's data will now be in a similiar structure as a SQL table. This is what I have done with industrial strenght imports of 3 million rows + from Excel and CSV with great success.
这篇关于任何加快此excel导入速度的方法吗?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!