Merge pull request #1200 from jhoneill/master

Update to Get-HTMLTable, added Import-ByColumns function and example
This commit is contained in:
Doug Finke
2022-06-17 12:04:06 -04:00
committed by GitHub
8 changed files with 219 additions and 79 deletions

View File

@@ -15,6 +15,6 @@ param(
$galleryUrl = "https://www.powershellgallery.com/packages/$moduleName"
$nolegend = '-nolegend'
if($chartType -eq 'pie') {$nolegend = $null}
$code = "$($chartType)Chart (Get-HtmlTable $galleryUrl 0 | sort lastupdated -desc) -title 'Download stats for $moduleName' $nolegend"
$code = "$($chartType)Chart (Get-HtmlTable $galleryUrl -FirstDataRow 1 | sort lastupdated -desc) -title 'Download stats for $moduleName' $nolegend"
$code | Invoke-Expression

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,146 @@
function Import-ByColumns {
    <#
      .Synopsis
        Works like Import-Excel but with data in columns instead of the conventional rows.
      .Description
        Import-excel will read the sample file in this folder like this
        > Import-excel FruitCity.xlsx | ft *
        GroupAs Apple Orange Banana
        ------- ----- ------ ------
        London  1     4      9
        Paris   2     4      10
        NewYork 6     5      11
        Munich  7     8      12

        Import-ByColumns transposes it
        > Import-Bycolumns FruitCity.xlsx | ft *
        GroupAs London Paris NewYork Munich
        ------- ------ ----- ------- ------
        Apple   1      2     6       7
        Orange  4      4     5       8
        Banana  9      10    11      12

        The column given by -StartColumn supplies the property names (unless -HeaderName or
        -NoHeader is used); every column to its right, up to -EndColumn, becomes one output object.
        A workbook opened from -Path is closed again (without saving) once it has been read;
        a workbook passed in with -ExcelPackage is left open for the caller.
      .Parameter HeaderName
        Property names to use instead of the labels in the left column; the n-th name is paired with the n-th row.
      .Parameter NoHeader
        Ignore the labels in the left column and name the properties P1, P2, P3 ...
      .Parameter DataOnly
        Do not output objects for columns in which every selected cell is empty (has no effect with -AsHash).
      .Parameter AsHash
        Output each column as an ordered hashtable instead of a PSCustomObject.
      .Example
        C:\> Import-Bycolumns -path .\VM_Build_Example.xlsx -StartRow 7 -EndRow 21 -EndColumn 7 -HeaderName Desc,size,type,cpu,ram,NetAcc,OS,OSDiskSize,DataDiskSize,LogDiskSize,TempDbDiskSize,BackupDiskSize,ImageDiskDize,AzureBackup,AzureReplication | ft -a *

        This reads a spreadsheet which has a block from row 7 to 21 containing 14 properties of virtual machines.
        The properties names are in column A and the 6 VMS are in columns B-G
        Because the property names are written for easy reading by the person completing the spreadsheet, they are replaced with new names.
        All the parameters work as they would for Import-Excel
    #>
    [Diagnostics.CodeAnalysis.SuppressMessageAttribute("PSAvoidUsingPlainTextForPassword", "")]
    param(
        [Alias('FullName')]
        [Parameter(ParameterSetName = "PathA", Mandatory, ValueFromPipelineByPropertyName, ValueFromPipeline, Position = 0 )]
        [Parameter(ParameterSetName = "PathB", Mandatory, ValueFromPipelineByPropertyName, ValueFromPipeline, Position = 0 )]
        [Parameter(ParameterSetName = "PathC", Mandatory, ValueFromPipelineByPropertyName, ValueFromPipeline, Position = 0 )]
        [String]$Path,

        [Parameter(ParameterSetName = "PackageA", Mandatory)]
        [Parameter(ParameterSetName = "PackageB", Mandatory)]
        [Parameter(ParameterSetName = "PackageC", Mandatory)]
        [OfficeOpenXml.ExcelPackage]$ExcelPackage,

        [Alias('Sheet')]
        [Parameter(Position = 1)]
        [ValidateNotNullOrEmpty()]
        [String]$WorksheetName,

        [Parameter(ParameterSetName = 'PathB'   , Mandatory)]
        [Parameter(ParameterSetName = 'PackageB', Mandatory)]
        [String[]]$HeaderName ,

        [Parameter(ParameterSetName = 'PathC'   , Mandatory)]
        [Parameter(ParameterSetName = 'PackageC', Mandatory)]
        [Switch]$NoHeader,

        [Alias('TopRow')]
        [ValidateRange(1, 9999)]
        [Int]$StartRow = 1,

        [Alias('StopRow', 'BottomRow')]
        [Int]$EndRow ,

        [Alias('LeftColumn','LabelColumn')]
        [Int]$StartColumn = 1,

        [Int]$EndColumn,

        [switch]$DataOnly,

        [switch]$AsHash,

        [ValidateNotNullOrEmpty()]
        [String]$Password
    )

    begin {
        function Get-PropertyNames {
            <#
              .SYNOPSIS
                Create objects containing the row number and the row name for each of the different header types.
              .DESCRIPTION
                Reads $HeaderName, $NoHeader and $Worksheet from the calling function's scope.
                Each object returned has a 'Row' property (worksheet row number) and a 'Value' property (property name).
            #>
            [Diagnostics.CodeAnalysis.SuppressMessageAttribute('PSUseSingularNouns', '', Justification = "Name would be incorrect, and command is not exported")]
            param(
                [Parameter(Mandatory)]
                [Int[]]$Rows,

                [Parameter(Mandatory)]
                [Int]$StartColumn
            )

            if ($HeaderName) {
                # Pair the n-th supplied name with the n-th row. Names beyond the last row have
                # no row to read from, so they are dropped instead of producing a null row index.
                $i = 0
                foreach ($h in $HeaderName) {
                    if ($i -ge $Rows.Count) {
                        Write-Warning "More header names were supplied than there are rows; ignoring '$h' and any names after it."
                        break
                    }
                    $h | Select-Object @{n='Row'; e={$Rows[$i]}}, @{n='Value'; e={$h} }
                    $i++
                }
            }
            elseif ($NoHeader) {
                # Generated names are 1-based: P1, P2, ...
                $i = 0
                foreach ($r in $Rows) {
                    $i++
                    $r | Select-Object @{n='Row'; e={$_}}, @{n='Value'; e={"P$i"} }
                }
            }
            else {
                foreach ($r in $Rows) {
                    #allow "False" or "0" to be headings
                    $Worksheet.Cells[$r, $StartColumn] | Where-Object {-not [string]::IsNullOrEmpty($_.Value) } | Select-Object @{n='Row'; e={$r} }, Value
                }
            }
        }
    }

    process {
        #region open file if necessary, find worksheet and ensure we have start/end row/columns
        # Use local variables rather than overwriting the parameters, so that each piped
        # path opens its own workbook and gets its own default end row/column.
        $package       = $ExcelPackage
        $openedPackage = $false
        if ($Path -and -not $package) {
            $openParams = @{ Path = $Path }
            if ($Password) { $openParams['Password'] = $Password }
            $package       = Open-ExcelPackage @openParams
            $openedPackage = $true
        }
        if (-not $package) {
            throw 'Could not get an Excel workbook to work on'
        }

        try {
            if     (-not $WorksheetName) { $Worksheet = $package.Workbook.Worksheets[1] }
            elseif (-not ($Worksheet = $package.Workbook.Worksheets[$WorksheetName])) {
                throw "Worksheet '$WorksheetName' not found, the workbook only contains the worksheets '$($package.Workbook.Worksheets)'. If you only wish to select the first worksheet, please remove the '-WorksheetName' parameter."
            }

            # An empty worksheet has no Dimension; the [int] cast then gives 0, which the guards below treat as "nothing to read".
            if ($EndRow)    { $lastRow    = $EndRow }    else { $lastRow    = [int]$Worksheet.Dimension.End.Row }
            if ($EndColumn) { $lastColumn = $EndColumn } else { $lastColumn = [int]$Worksheet.Dimension.End.Column }
            #endregion

            # The range operator counts DOWN when its end is below its start (1..0 is 1,0),
            # so the bounds are compared explicitly before any range is built.
            $PropertyNames = $null
            if ($lastRow -ge $StartRow) {
                $PropertyNames = Get-PropertyNames -Rows ($StartRow..$lastRow) -StartColumn $StartColumn
            }
            if (-not $PropertyNames) {
                throw "No headers found in left column '$StartColumn'. "
            }

            if ($lastColumn -le $StartColumn) {
                Write-Warning "Worksheet '$($Worksheet.Name)' in workbook contains no data in the rows after left column '$StartColumn'"
            }
            else {
                # Every column to the right of the label column becomes one output item.
                foreach ($c in (1 + $StartColumn)..$lastColumn) {
                    $NewColumn = [Ordered]@{ }
                    foreach ($p in $PropertyNames) {
                        $NewColumn[$p.Value] = $Worksheet.Cells[$p.Row, $c].Text
                    }
                    if     ($AsHash) {$NewColumn}
                    # "-ne" on a collection filters it, so this is true when at least one cell has text
                    elseif (($NewColumn.Values -ne "") -or -not $DataOnly) {[PSCustomObject]$NewColumn}
                }
            }
        }
        finally {
            # Only close what this function opened; a caller-supplied package stays open.
            if ($openedPackage) { Close-ExcelPackage -ExcelPackage $package -NoSave }
        }
    }
}

View File

@@ -1,52 +0,0 @@
Function Import-Bycolumns {
    <#
      .SYNOPSIS
        Reads a worksheet whose records are laid out in columns rather than rows.
      .DESCRIPTION
        The cells in -StartColumn (between -StartRow and -EndRow) supply the property names;
        each column to the right of it, up to -EndColumn, is output as one PSCustomObject
        holding the text of the cells in that column.
        When -EndRow or -EndColumn is omitted, the end of the worksheet's used range is used.
      .PARAMETER ExcelPackage
        An already opened workbook. It is not closed by this function.
      .PARAMETER WorksheetName
        Name of the sheet to read. When omitted, Worksheets[1] is used.
    #>
    Param(
        [Parameter(Mandatory=$true)]
        [OfficeOpenXml.ExcelPackage]$ExcelPackage,
        [Int]$StartRow = 1,
        [String]$WorksheetName,
        [Int]$EndRow ,
        [Int]$StartColumn = 1,
        [Int]$EndColumn
    )

    Function Get-RowNames {
        <#
          .SYNOPSIS
            Returns one object per non-empty cell in the label column, with 'Row' (row number) and 'Value' (cell value) properties.
            Reads $Worksheet from the calling function's scope.
        #>
        [Diagnostics.CodeAnalysis.SuppressMessageAttribute('PSUseSingularNouns', '', Justification = "Name would be incorrect, and command is not exported")]
        param(
            [Parameter(Mandatory)]
            [Int[]]$Rows,
            [Parameter(Mandatory)]
            [Int]$StartColumn
        )
        foreach ($R in $Rows) {
            #allow "False" or "0" to be headings
            $Worksheet.Cells[$R, $StartColumn] | Where-Object {-not [string]::IsNullOrEmpty($_.Value) } | Select-Object @{N = 'Row'; E = { $R } }, Value
        }
    }

    if     (-not $WorksheetName) { $Worksheet = $ExcelPackage.Workbook.Worksheets[1] }
    elseif (-not ($Worksheet = $ExcelPackage.Workbook.Worksheets[$WorksheetName])) {
        throw "Worksheet '$WorksheetName' not found, the workbook only contains the worksheets '$($ExcelPackage.Workbook.Worksheets)'. If you only wish to select the first worksheet, please remove the '-WorksheetName' parameter."
    }

    # An empty worksheet has no Dimension; the [Int] parameters then become 0, which the guards below treat as "nothing to read".
    if (-not $EndRow   ) { $EndRow    = $Worksheet.Dimension.End.Row    }
    if (-not $EndColumn) { $EndColumn = $Worksheet.Dimension.End.Column }

    # The range operator counts DOWN when its end is below its start (1..0 is 1,0),
    # so the bounds are compared explicitly before any range is built.
    $PropertyNames = $null
    if ($EndRow -ge $StartRow) {
        $PropertyNames = Get-RowNames -Rows ($StartRow..$EndRow) -StartColumn $StartColumn
    }
    if (-not $PropertyNames) {
        throw "No headers found in left column '$StartColumn'. "
    }

    if ($EndColumn -le $StartColumn) {
        Write-Warning "Worksheet '$($Worksheet.Name)' in workbook contains no data in the rows after left column '$StartColumn'"
    }
    else {
        # Every column to the right of the label column becomes one output object.
        foreach ($c in (1 + $StartColumn)..$EndColumn) {
            $NewColumn = [Ordered]@{ }
            foreach ($p in $PropertyNames) {
                $NewColumn[$p.Value] = $Worksheet.Cells[$p.Row, $c].Text
            }
            [PSCustomObject]$NewColumn
        }
    }
}

View File

@@ -9,36 +9,75 @@ function Get-HtmlTable {
[int]$FirstDataRow=0,
[Switch]$UseDefaultCredentials
)
if ($PSVersionTable.PSVersion.Major -gt 5 -and -not (Get-Command ConvertFrom-Html -ErrorAction SilentlyContinue)) {
# Invoke-WebRequest on .NET Core doesn't have ParsedHtml, so we need HtmlAgilityPack or similar; Justin Grote's PowerHTML wraps that nicely.
throw "This version of PowerShell needs the PowerHTML module to process HTML Tables."
}
$r = Invoke-WebRequest $Url -UseDefaultCredentials: $UseDefaultCredentials
$propertyNames = $Header
$table = $r.ParsedHtml.getElementsByTagName("table")[$TableIndex]
$propertyNames=$Header
$totalRows=@($table.rows).count
if ($PSVersionTable.PSVersion.Major -le 5) {
$table = $r.ParsedHtml.getElementsByTagName("table")[$TableIndex]
$totalRows=@($table.rows).count
for ($idx = $FirstDataRow; $idx -lt $totalRows; $idx++) {
for ($idx = $FirstDataRow; $idx -lt $totalRows; $idx++) {
$row = $table.rows[$idx]
$cells = @($row.cells)
$row = $table.rows[$idx]
$cells = @($row.cells)
if(!$propertyNames) {
if($cells[0].tagName -eq 'th') {
$propertyNames = @($cells | ForEach-Object {$_.innertext -replace ' ',''})
} else {
$propertyNames = @(1..($cells.Count + 2) | Foreach-Object { "P$_" })
if(!$propertyNames) {
if($cells[0].tagName -eq 'th') {
$propertyNames = @($cells | ForEach-Object {$_.innertext -replace ' ',''})
} else {
$propertyNames = @(1..($cells.Count + 2) | Foreach-Object { "P$_" })
}
continue
}
continue
$result = [ordered]@{}
for($counter = 0; $counter -lt $cells.Count; $counter++) {
$propertyName = $propertyNames[$counter]
if(!$propertyName) { $propertyName= '[missing]'}
$result.$propertyName= $cells[$counter].InnerText
}
[PSCustomObject]$result
}
$result = [ordered]@{}
for($counter = 0; $counter -lt $cells.Count; $counter++) {
$propertyName = $propertyNames[$counter]
if(!$propertyName) { $propertyName= '[missing]'}
$result.$propertyName= $cells[$counter].InnerText
}
else {
$h = ConvertFrom-Html -Content $r.Content
if ($TableIndex -is [valuetype]) { $TableIndex += 1}
$rows = $h.SelectNodes("//table[$TableIndex]//tr")
if (-not $rows) {Write-Warning "Could not find rows for `"//table[$TableIndex]`" in $Url ."}
if ( -not $propertyNames) {
if ( $tableHeaders = $rows[$FirstDataRow].SelectNodes("th")) {
$propertyNames = $tableHeaders.foreach({[System.Web.HttpUtility]::HtmlDecode( $_.innerText ) -replace '\W+','_' -replace '(\w)_+$','$1' })
$FirstDataRow += 1
}
else {
$c = 0
$propertyNames = $rows[$FirstDataRow].SelectNodes("td") | Foreach-Object { "P$c" ; $c ++ }
}
}
Write-Verbose ("Property names: " + ($propertyNames -join ","))
foreach ($n in $FirstDataRow..($rows.Count-1)) {
$r = $rows[$n].SelectNodes("td|th")
if ($r -and $r.innerText -ne "" -and $r.count -gt $rows[$n].SelectNodes("th").count ) {
$c = 0
$newObj = [ordered]@{}
foreach ($p in $propertyNames) {
$n = $null
#Join descendants for cases where the text in the cell is split (e.g. with a <BR>). We also want to remove HTML codes, trim and convert the unicode minus sign to "-"
$cellText = $r[$c].Descendants().where({$_.NodeType -eq "Text"}).foreach({[System.Web.HttpUtility]::HtmlDecode( $_.innerText ).Trim()}) -Join " " -replace "\u2212","-"
if ([double]::TryParse($cellText, [ref]$n)) {$newObj[$p] = $n }
else {$newObj[$p] = $cellText }
$c ++
}
[pscustomObject]$newObj
}
}
[PSCustomObject]$result
}
}

View File

@@ -3,13 +3,13 @@ function Import-Html {
[CmdletBinding()]
param(
$Url,
$Index,
[int]$Index = 0,
$Header,
[int]$FirstDataRow=0,
[int]$FirstDataRow = 0,
[Switch]$UseDefaultCredentials
)
$xlFile = [System.IO.Path]::GetTempFileName() -replace "tmp","xlsx"
$xlFile = [System.IO.Path]::GetTempFileName() -replace "tmp", "xlsx"
Remove-Item $xlFile -ErrorAction Ignore
Write-Verbose "Exporting to Excel file $($xlFile)"

View File

@@ -1,5 +1,12 @@
# 7.6.0
- Fix -StartRow and -StartColumn being ignored.
- **_[Under investigation]_** Fix -StartRow and -StartColumn being ignored.
- James O'Neill:
- Update Get-HtmlTable to support using PowerHTML (maintained by [Justin Grote](https://twitter.com/JustinWGrote)).
- Added an example that includes a new function, Import-ByColumns. It works like Import-Excel but with data in columns instead of the conventional rows.
- Update Import-HTML with better defaults
- Fixed example `Get-ModuleStats.ps1` which reads the PowerShell Gallery page and extracts the stats table
# v7.5.2
- Changed the switch `-NotAsDictionary` to `-Raw`. Works with `-Worksheetname *` reads all the sheets in the xlsx file and returns an array.