Fix Get-HTMLTable for .NET core

This commit is contained in:
James O'Neill
2022-06-17 08:19:55 +01:00
parent 6f2bfaff4e
commit 701b8e2062

View File

@@ -9,36 +9,75 @@ function Get-HtmlTable {
[int]$FirstDataRow=0,
[Switch]$UseDefaultCredentials
)
if ($PSVersionTable.PSVersion.Major -gt 5 -and -not (Get-Command ConvertFrom-Html -ErrorAction SilentlyContinue)) {
# Invoke-WebRequest on .NET core doesn't have ParsedHtml so we need HtmlAgilityPack or similiar Justin Grote's PowerHTML wraps that nicely
throw "This version of PowerShell needs the PowerHTML module to process HTML Tables."
}
$r = Invoke-WebRequest $Url -UseDefaultCredentials: $UseDefaultCredentials
$propertyNames = $Header
$table = $r.ParsedHtml.getElementsByTagName("table")[$TableIndex]
$propertyNames=$Header
$totalRows=@($table.rows).count
if ($PSVersionTable.PSVersion.Major -le 5) {
$table = $r.ParsedHtml.getElementsByTagName("table")[$TableIndex]
$totalRows=@($table.rows).count
for ($idx = $FirstDataRow; $idx -lt $totalRows; $idx++) {
for ($idx = $FirstDataRow; $idx -lt $totalRows; $idx++) {
$row = $table.rows[$idx]
$cells = @($row.cells)
$row = $table.rows[$idx]
$cells = @($row.cells)
if(!$propertyNames) {
if($cells[0].tagName -eq 'th') {
$propertyNames = @($cells | ForEach-Object {$_.innertext -replace ' ',''})
} else {
$propertyNames = @(1..($cells.Count + 2) | Foreach-Object { "P$_" })
if(!$propertyNames) {
if($cells[0].tagName -eq 'th') {
$propertyNames = @($cells | ForEach-Object {$_.innertext -replace ' ',''})
} else {
$propertyNames = @(1..($cells.Count + 2) | Foreach-Object { "P$_" })
}
continue
}
continue
$result = [ordered]@{}
for($counter = 0; $counter -lt $cells.Count; $counter++) {
$propertyName = $propertyNames[$counter]
if(!$propertyName) { $propertyName= '[missing]'}
$result.$propertyName= $cells[$counter].InnerText
}
[PSCustomObject]$result
}
$result = [ordered]@{}
for($counter = 0; $counter -lt $cells.Count; $counter++) {
$propertyName = $propertyNames[$counter]
if(!$propertyName) { $propertyName= '[missing]'}
$result.$propertyName= $cells[$counter].InnerText
}
else {
$h = ConvertFrom-Html -Content $r.Content
if ($TableIndex -is [valuetype]) { $TableIndex += 1}
$rows = $h.SelectNodes("//table[$TableIndex]//tr")
if (-not $rows) {Write-Warning "Could not find rows for `"//table[$TableIndex]`" in $Url ."}
if ( -not $propertyNames) {
if ( $tableHeaders = $rows[$FirstDataRow].SelectNodes("th")) {
$propertyNames = $tableHeaders.foreach({[System.Web.HttpUtility]::HtmlDecode( $_.innerText ) -replace '\W+','_' -replace '(\w)_+$','$1' })
$FirstDataRow += 1
}
else {
$c = 0
$propertyNames = $rows[$FirstDataRow].SelectNodes("td") | Foreach-Object { "P$c" ; $c ++ }
}
}
Write-Verbose ("Property names: " + ($propertyNames -join ","))
foreach ($n in $FirstDataRow..($rows.Count-1)) {
$r = $rows[$n].SelectNodes("td|th")
if ($r -and $r.innerText -ne "" -and $r.count -gt $rows[$n].SelectNodes("th").count ) {
$c = 0
$newObj = [ordered]@{}
foreach ($p in $propertyNames) {
$n = $null
#Join descentandts for cases where the text in the cell is split (e.g with a <BR> ). We also want to remove HTML codes, trim and convert unicode minus sign to "-"
$cellText = $r[$c].Descendants().where({$_.NodeType -eq "Text"}).foreach({[System.Web.HttpUtility]::HtmlDecode( $_.innerText ).Trim()}) -Join " " -replace "\u2212","-"
if ([double]::TryParse($cellText, [ref]$n)) {$newObj[$p] = $n }
else {$newObj[$p] = $cellText }
$c ++
}
[pscustomObject]$newObj
}
}
[PSCustomObject]$result
}
}