diff --git a/Public/Get-HtmlTable.ps1 b/Public/Get-HtmlTable.ps1 index b613a98..4851d4f 100644 --- a/Public/Get-HtmlTable.ps1 +++ b/Public/Get-HtmlTable.ps1 @@ -9,36 +9,75 @@ function Get-HtmlTable { [int]$FirstDataRow=0, [Switch]$UseDefaultCredentials ) + if ($PSVersionTable.PSVersion.Major -gt 5 -and -not (Get-Command ConvertFrom-Html -ErrorAction SilentlyContinue)) { + # Invoke-WebRequest on .NET Core doesn't have ParsedHtml so we need HtmlAgilityPack or similar; Justin Grote's PowerHTML wraps that nicely + throw "This version of PowerShell needs the PowerHTML module to process HTML Tables." + } $r = Invoke-WebRequest $Url -UseDefaultCredentials: $UseDefaultCredentials + $propertyNames = $Header - $table = $r.ParsedHtml.getElementsByTagName("table")[$TableIndex] - $propertyNames=$Header - $totalRows=@($table.rows).count + if ($PSVersionTable.PSVersion.Major -le 5) { + $table = $r.ParsedHtml.getElementsByTagName("table")[$TableIndex] + $totalRows=@($table.rows).count - for ($idx = $FirstDataRow; $idx -lt $totalRows; $idx++) { + for ($idx = $FirstDataRow; $idx -lt $totalRows; $idx++) { - $row = $table.rows[$idx] - $cells = @($row.cells) + $row = $table.rows[$idx] + $cells = @($row.cells) - if(!$propertyNames) { - if($cells[0].tagName -eq 'th') { - $propertyNames = @($cells | ForEach-Object {$_.innertext -replace ' ',''}) - } else { - $propertyNames = @(1..($cells.Count + 2) | Foreach-Object { "P$_" }) + if(!$propertyNames) { + if($cells[0].tagName -eq 'th') { + $propertyNames = @($cells | ForEach-Object {$_.innertext -replace ' ',''}) + } else { + $propertyNames = @(1..($cells.Count + 2) | Foreach-Object { "P$_" }) + } + continue } - continue + + $result = [ordered]@{} + + for($counter = 0; $counter -lt $cells.Count; $counter++) { + $propertyName = $propertyNames[$counter] + + if(!$propertyName) { $propertyName= '[missing]'} + $result.$propertyName= $cells[$counter].InnerText + } + + [PSCustomObject]$result } - - $result = [ordered]@{} - - for($counter = 0; 
$counter -lt $cells.Count; $counter++) { - $propertyName = $propertyNames[$counter] - - if(!$propertyName) { $propertyName= '[missing]'} - $result.$propertyName= $cells[$counter].InnerText + } + else { + $h = ConvertFrom-Html -Content $r.Content + if ($TableIndex -is [valuetype]) { $TableIndex += 1} + $rows = $h.SelectNodes("//table[$TableIndex]//tr") + if (-not $rows) {Write-Warning "Could not find rows for `"//table[$TableIndex]`" in $Url ."} + if ( -not $propertyNames) { + if ( $tableHeaders = $rows[$FirstDataRow].SelectNodes("th")) { + $propertyNames = $tableHeaders.foreach({[System.Web.HttpUtility]::HtmlDecode( $_.innerText ) -replace '\W+','_' -replace '(\w)_+$','$1' }) + $FirstDataRow += 1 + } + else { + $c = 0 + $propertyNames = $rows[$FirstDataRow].SelectNodes("td") | Foreach-Object { "P$c" ; $c ++ } + } + } + Write-Verbose ("Property names: " + ($propertyNames -join ",")) + foreach ($n in $FirstDataRow..($rows.Count-1)) { + $r = $rows[$n].SelectNodes("td|th") + if ($r -and $r.innerText -ne "" -and $r.count -gt $rows[$n].SelectNodes("th").count ) { + $c = 0 + $newObj = [ordered]@{} + foreach ($p in $propertyNames) { + $n = $null + #Join descendants for cases where the text in the cell is split (e.g. with a <br>). We also want to remove HTML codes, trim and convert Unicode minus sign to "-" + $cellText = $r[$c].Descendants().where({$_.NodeType -eq "Text"}).foreach({[System.Web.HttpUtility]::HtmlDecode( $_.innerText ).Trim()}) -Join " " -replace "\u2212","-" + if ([double]::TryParse($cellText, [ref]$n)) {$newObj[$p] = $n } + else {$newObj[$p] = $cellText } + $c ++ + } + [pscustomObject]$newObj + } } - - [PSCustomObject]$result } }