diff --git a/Examples/Extra/Get-ModuleStats.ps1 b/Examples/Extra/Get-ModuleStats.ps1
index e433d40..267ea50 100644
--- a/Examples/Extra/Get-ModuleStats.ps1
+++ b/Examples/Extra/Get-ModuleStats.ps1
@@ -15,6 +15,6 @@ param(
 $galleryUrl = "https://www.powershellgallery.com/packages/$moduleName"
 $nolegend = '-nolegend'
 if($chartType -eq 'pie') {$nolegend = $null}
-$code = "$($chartType)Chart (Get-HtmlTable $galleryUrl 0 | sort lastupdated -desc) -title 'Download stats for $moduleName' $nolegend"
+$code = "$($chartType)Chart (Get-HtmlTable $galleryUrl -FirstDataRow 1 | sort lastupdated -desc) -title 'Download stats for $moduleName' $nolegend"
 
 $code | Invoke-Expression
\ No newline at end of file
diff --git a/Examples/ImportByColumns/FruitCity.xlsx b/Examples/ImportByColumns/FruitCity.xlsx
new file mode 100644
index 0000000..cb05fa1
Binary files /dev/null and b/Examples/ImportByColumns/FruitCity.xlsx differ
diff --git a/Examples/ImportByColumns/VM_Build_Example.xlsx b/Examples/ImportByColumns/VM_Build_Example.xlsx
new file mode 100644
index 0000000..33ae355
Binary files /dev/null and b/Examples/ImportByColumns/VM_Build_Example.xlsx differ
diff --git a/Examples/ImportByColumns/import-by-columns.ps1 b/Examples/ImportByColumns/import-by-columns.ps1
new file mode 100644
index 0000000..e9bf1cc
--- /dev/null
+++ b/Examples/ImportByColumns/import-by-columns.ps1
@@ -0,0 +1,146 @@
+function Import-ByColumns {
+<#
+    .synopsis
+        Works like Import-Excel but with data in columns instead of the conventional rows.
+    .Description
+        Import-excel will read the sample file in this folder like this
+        > Import-excel FruitCity.xlsx | ft *
+        GroupAs Apple Orange Banana
+        ------- ----- ------ ------
+        London  1     4      9
+        Paris   2     4      10
+        NewYork 6     5      11
+        Munich  7     8      12
+        Import-ByColumns transposes it
+        > Import-Bycolumns FruitCity.xlsx | ft *
+        GroupAs London Paris NewYork Munich
+        ------- ------ ----- ------- ------
+        Apple   1      2     6       7
+        Orange  4      4     5       8
+        Banana  9      10    11      12
+    .Example
+        C:\> Import-Bycolumns -path .\VM_Build_Example.xlsx -StartRow 7 -EndRow 21 -EndColumn 7 -HeaderName Desc,size,type,
+        cpu,ram,NetAcc,OS,OSDiskSize,DataDiskSize,LogDiskSize,TempDbDiskSize,BackupDiskSize,ImageDiskSize,AzureBackup,AzureReplication | ft -a *
+
+        This reads a spreadsheet which has a block from row 7 to 21 containing 14 properties of virtual machines.
+        The property names are in column A and the 6 VMs are in columns B-G.
+        Because the property names are written for easy reading by the person completing the spreadsheet, they are replaced with new names.
+        All the parameters work as they would for Import-Excel
+#>
+
+    [Diagnostics.CodeAnalysis.SuppressMessageAttribute("PSAvoidUsingPlainTextForPassword", "")]
+    param(
+        [Alias('FullName')]
+        [Parameter(ParameterSetName = "PathA", Mandatory, ValueFromPipelineByPropertyName, ValueFromPipeline, Position = 0 )]
+        [Parameter(ParameterSetName = "PathB", Mandatory, ValueFromPipelineByPropertyName, ValueFromPipeline, Position = 0 )]
+        [Parameter(ParameterSetName = "PathC", Mandatory, ValueFromPipelineByPropertyName, ValueFromPipeline, Position = 0 )]
+        [String]$Path,
+
+        [Parameter(ParameterSetName = "PackageA", Mandatory)]
+        [Parameter(ParameterSetName = "PackageB", Mandatory)]
+        [Parameter(ParameterSetName = "PackageC", Mandatory)]
+        [OfficeOpenXml.ExcelPackage]$ExcelPackage,
+
+        [Alias('Sheet')]
+        [Parameter(Position = 1)]
+        [ValidateNotNullOrEmpty()]
+        [String]$WorksheetName,
+
+        [Parameter(ParameterSetName = 'PathB'   , Mandatory)]
+        [Parameter(ParameterSetName = 'PackageB', Mandatory)]
+        [String[]]$HeaderName ,
+        [Parameter(ParameterSetName = 'PathC'   , Mandatory)]
+        [Parameter(ParameterSetName = 'PackageC', Mandatory)]
+        [Switch]$NoHeader,
+
+        [Alias('TopRow')]
+        [ValidateRange(1, 9999)]
+        [Int]$StartRow = 1,
+
+        [Alias('StopRow', 'BottomRow')]
+        [Int]$EndRow ,
+
+        [Alias('LeftColumn','LabelColumn')]
+        [Int]$StartColumn = 1,
+
+        [Int]$EndColumn,
+        [switch]$DataOnly,
+        [switch]$AsHash,
+
+        [ValidateNotNullOrEmpty()]
+        [String]$Password
+    )
+    function Get-PropertyNames {
+        <#
+            .SYNOPSIS
+                Create objects containing the row number and the row name for each of the different header types.
+        #>
+        [Diagnostics.CodeAnalysis.SuppressMessageAttribute('PSUseSingularNouns', '', Justification = "Name would be incorrect, and command is not exported")]
+        param(
+            [Parameter(Mandatory)]
+            [Int[]]$Rows,
+            [Parameter(Mandatory)]
+            [Int]$StartColumn
+        )
+        if ($HeaderName) {
+            $i = 0
+            foreach ($h in $HeaderName) {
+                $h | Select-Object @{n='Row'; e={$rows[$i]}}, @{n='Value'; e={$h} }
+                $i++
+            }
+        }
+        elseif ($NoHeader) {
+            $i = 0
+            foreach ($r in $rows) {
+                $i++
+                $r | Select-Object @{n='Row'; e={$_}}, @{n='Value'; e={"P$i"} }
+            }
+        }
+        else {
+            foreach ($r in $Rows) {
+                #allow "False" or "0" to be headings
+                $Worksheet.Cells[$r, $StartColumn] | Where-Object {-not [string]::IsNullOrEmpty($_.Value) } | Select-Object @{n='Row'; e={$r} }, Value
+            }
+        }
+    }
+
+#region open file if necessary, find worksheet and ensure we have start/end row/columns
+    if ($Path -and -not $ExcelPackage -and $Password) {
+        $ExcelPackage = Open-ExcelPackage -Path $Path -Password $Password
+    }
+    elseif ($Path -and -not $ExcelPackage ) {
+        $ExcelPackage = Open-ExcelPackage -Path $Path
+    }
+    if (-not $ExcelPackage) {
+        throw 'Could not get an Excel workbook to work on' ; return
+    }
+
+    if (-not $WorksheetName) { $Worksheet = $ExcelPackage.Workbook.Worksheets[1] }
+    elseif (-not ($Worksheet = $ExcelPackage.Workbook.Worksheets[$WorkSheetName])) {
+        throw "Worksheet '$WorksheetName' not found, the workbook only contains the worksheets '$($ExcelPackage.Workbook.Worksheets)'. If you only wish to select the first worksheet, please remove the '-WorksheetName' parameter." ; return
+    }
+
+    if (-not $EndRow   ) { $EndRow    = $Worksheet.Dimension.End.Row }
+    if (-not $EndColumn) { $EndColumn = $Worksheet.Dimension.End.Column }
+#endregion
+
+    $Rows    = @(if ($EndRow -ge $StartRow) { $StartRow..$EndRow })                     # guarded: '..' counts DOWN (e.g. 5..1) when the end is below the start
+    $Columns = @(if ($EndColumn -gt $StartColumn) { (1 + $StartColumn)..$EndColumn })   # empty when nothing lies to the right of the label column, so the warning below can fire
+
+    if ((-not $rows) -or (-not ($PropertyNames = Get-PropertyNames -Rows $Rows -StartColumn $StartColumn))) {
+        throw "No headers found in left coulmn '$Startcolumn'. "; return
+    }
+    if (-not $Columns) {
+        Write-Warning "Worksheet '$WorksheetName' in workbook contains no data in the rows after left column '$StartColumn'"
+    }
+    else {
+        foreach ($c in $Columns) {
+            $NewColumn = [Ordered]@{ }
+            foreach ($p in $PropertyNames) {
+                $NewColumn[$p.Value] = $Worksheet.Cells[$p.row,$c].text
+            }
+            if     ($AsHash) {$NewColumn}
+            elseif (($NewColumn.Values -ne "") -or -not $dataonly) {[PSCustomObject]$NewColumn}
+        }
+    }
+}
diff --git a/Examples/import-by-columns.ps1 b/Examples/import-by-columns.ps1
deleted file mode 100644
index e70015d..0000000
--- a/Examples/import-by-columns.ps1
+++ /dev/null
@@ -1,52 +0,0 @@
-
-Function Import-Bycolumns {
-    Param(
-        [Parameter(Mandatory=$true)]
-        [OfficeOpenXml.ExcelPackage]$ExcelPackage,
-        [Int]$StartRow = 1,
-        [String]$WorksheetName,
-        [Int]$EndRow ,
-        [Int]$StartColumn = 1,
-        [Int]$EndColumn
-    )
-    Function Get-RowNames {
-        [Diagnostics.CodeAnalysis.SuppressMessageAttribute('PSUseSingularNouns', '', Justification = "Name would be incorrect, and command is not exported")]
-        param(
-            [Parameter(Mandatory)]
-            [Int[]]$Rows,
-            [Parameter(Mandatory)]
-            [Int]$StartColumn
-        )
-        foreach ($R in $Rows) {
-            #allow "False" or "0" to be headings
-            $Worksheet.Cells[$R, $StartColumn] | Where-Object {-not [string]::IsNullOrEmpty($_.Value) } | Select-Object @{N = 'Row'; E = { $R } }, Value
-        }
-    }
-
-    if (-not $WorksheetName) { $Worksheet = $ExcelPackage.Workbook.Worksheets[1] }
-    elseif (-not ($Worksheet = $ExcelPackage.Workbook.Worksheets[$WorkSheetName])) {
-        throw "Worksheet '$WorksheetName' not found, the workbook only contains the worksheets '$($ExcelPackage.Workbook.Worksheets)'. If you only wish to select the first worksheet, please remove the '-WorksheetName' parameter." ; return
-    }
-
-    if (-not $EndRow ) { $EndRow = $Worksheet.Dimension.End.Row }
-    if (-not $EndColumn) { $EndColumn = $Worksheet.Dimension.End.Column }
-
-    $Rows = $Startrow .. $EndRow ;
-    $Columns = (1 + $StartColumn)..$EndColumn
-
-    if ((-not $rows) -or (-not ($PropertyNames = Get-RowNames -Rows $Rows -StartColumn $StartColumn))) {
-        throw "No headers found in left coulmn '$Startcolumn'. "; return
-    }
-    if (-not $Columns) {
-        Write-Warning "Worksheet '$WorksheetName' in workbook contains no data in the rows after left column '$StartColumn'"
-    }
-    else {
-        foreach ($c in $Columns) {
-            $NewColumn = [Ordered]@{ }
-            foreach ($p in $PropertyNames) {
-                $NewColumn[$p.Value] = $Worksheet.Cells[$p.row,$c].text
-            }
-            [PSCustomObject]$NewColumn
-        }
-    }
-}
diff --git a/Public/Get-HtmlTable.ps1 b/Public/Get-HtmlTable.ps1
index b613a98..4851d4f 100644
--- a/Public/Get-HtmlTable.ps1
+++ b/Public/Get-HtmlTable.ps1
@@ -9,36 +9,75 @@ function Get-HtmlTable {
         [int]$FirstDataRow=0,
         [Switch]$UseDefaultCredentials
     )
+    if ($PSVersionTable.PSVersion.Major -gt 5 -and -not (Get-Command ConvertFrom-Html -ErrorAction SilentlyContinue)) {
+        # Invoke-WebRequest on .NET Core doesn't have ParsedHtml, so we need HtmlAgilityPack or similar; Justin Grote's PowerHTML module wraps that nicely.
+        throw "This version of PowerShell needs the PowerHTML module to process HTML Tables."
+    }
 
     $r = Invoke-WebRequest $Url -UseDefaultCredentials: $UseDefaultCredentials
+    $propertyNames = $Header
 
-    $table = $r.ParsedHtml.getElementsByTagName("table")[$TableIndex]
-    $propertyNames=$Header
-    $totalRows=@($table.rows).count
+    if ($PSVersionTable.PSVersion.Major -le 5) {
+        $table = $r.ParsedHtml.getElementsByTagName("table")[$TableIndex]
+        $totalRows=@($table.rows).count
 
-    for ($idx = $FirstDataRow; $idx -lt $totalRows; $idx++) {
+        for ($idx = $FirstDataRow; $idx -lt $totalRows; $idx++) {
 
-        $row = $table.rows[$idx]
-        $cells = @($row.cells)
+            $row = $table.rows[$idx]
+            $cells = @($row.cells)
 
-        if(!$propertyNames) {
-            if($cells[0].tagName -eq 'th') {
-                $propertyNames = @($cells | ForEach-Object {$_.innertext -replace ' ',''})
-            } else {
-                $propertyNames = @(1..($cells.Count + 2) | Foreach-Object { "P$_" })
+            if(!$propertyNames) {
+                if($cells[0].tagName -eq 'th') {
+                    $propertyNames = @($cells | ForEach-Object {$_.innertext -replace ' ',''})
+                } else {
+                    $propertyNames = @(1..($cells.Count + 2) | Foreach-Object { "P$_" })
+                }
+                continue
             }
-            continue
+
+            $result = [ordered]@{}
+
+            for($counter = 0; $counter -lt $cells.Count; $counter++) {
+                $propertyName = $propertyNames[$counter]
+
+                if(!$propertyName) { $propertyName= '[missing]'}
+                $result.$propertyName= $cells[$counter].InnerText
+            }
+
+            [PSCustomObject]$result
         }
-
-        $result = [ordered]@{}
-
-        for($counter = 0; $counter -lt $cells.Count; $counter++) {
-            $propertyName = $propertyNames[$counter]
-
-            if(!$propertyName) { $propertyName= '[missing]'}
-            $result.$propertyName= $cells[$counter].InnerText
+    }
+    else {
+        $h = ConvertFrom-Html -Content $r.Content
+        if ($TableIndex -is [valuetype]) { $TableIndex += 1}   # -TableIndex is 0-based, XPath positions are 1-based
+        $rows = $h.SelectNodes("(//table)[$TableIndex]//tr")   # parenthesised: the Nth table in the document, not every table that is Nth within its own parent
+        if (-not $rows) {Write-Warning "Could not find rows for `"(//table)[$TableIndex]`" in $Url ."; return}   # nothing to index into below
+        if ( -not $propertyNames) {
+            if ( $tableHeaders = $rows[$FirstDataRow].SelectNodes("th")) {
+                $propertyNames = $tableHeaders.foreach({[System.Web.HttpUtility]::HtmlDecode( $_.innerText ) -replace '\W+','_' -replace '(\w)_+$','$1' })
+                $FirstDataRow += 1
+            }
+            else {
+                $c = 0
+                $propertyNames = $rows[$FirstDataRow].SelectNodes("td") | Foreach-Object { "P$c" ; $c ++ }
+            }
+        }
+        Write-Verbose ("Property names: " + ($propertyNames -join ","))
+        for ($n = $FirstDataRow; $n -lt $rows.Count; $n++) {   # a for loop, not a range: 1..0 would count downwards for a header-only table
+            $r = $rows[$n].SelectNodes("td|th")
+            if ($r -and $r.innerText -ne "" -and $r.count -gt $rows[$n].SelectNodes("th").count ) {
+                $c = 0
+                $newObj = [ordered]@{}
+                foreach ($p in $propertyNames) {
+                    $number = $null
+                    #Join descendants for cases where the text in the cell is split (e.g. with a <br>). We also want to remove HTML codes, trim and convert the Unicode minus sign to "-"
+                    $cellText = $r[$c].Descendants().where({$_.NodeType -eq "Text"}).foreach({[System.Web.HttpUtility]::HtmlDecode( $_.innerText ).Trim()}) -Join " " -replace "\u2212","-"
+                    if ([double]::TryParse($cellText, [ref]$number)) {$newObj[$p] = $number }
+                    else {$newObj[$p] = $cellText }
+                    $c ++
+                }
+                [pscustomObject]$newObj
+            }
         }
-
-        [PSCustomObject]$result
     }
 }
diff --git a/Public/Import-Html.ps1 b/Public/Import-Html.ps1
index 5c0b020..6a22df4 100644
--- a/Public/Import-Html.ps1
+++ b/Public/Import-Html.ps1
@@ -3,13 +3,13 @@ function Import-Html {
     [CmdletBinding()]
     param(
         $Url,
-        $Index,
+        [int]$Index = 0,
         $Header,
-        [int]$FirstDataRow=0,
+        [int]$FirstDataRow = 0,
         [Switch]$UseDefaultCredentials
     )
 
-    $xlFile = [System.IO.Path]::GetTempFileName() -replace "tmp","xlsx"
+    $xlFile = [System.IO.Path]::GetTempFileName() -replace '\.tmp$', '.xlsx'   # swap only the extension; a bare "tmp" also rewrote the /tmp directory on Linux and macOS
     Remove-Item $xlFile -ErrorAction Ignore
 
     Write-Verbose "Exporting to Excel file $($xlFile)"
diff --git a/changelog.md b/changelog.md
index fde3f92..64fab5a 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,5 +1,12 @@
 # 7.6.0
-- Fix -StartRow and -StartColumn being ignored.
+
+- **_[Under investigation]_** Fix -StartRow and -StartColumn being ignored.
+- James O'Neill:
+    - Update Get-HtmlTable to support using PowerHTML (maintained by [Justin Grote](https://twitter.com/JustinWGrote)).
+    - Added an example including a new function, Import-ByColumns. It works like Import-Excel but with data in columns instead of the conventional rows.
+- Update Import-HTML with better defaults
+- Fixed example `Get-ModuleStats.ps1` which reads the PowerShell Gallery page and extracts the stats table
+
 
 # v7.5.2
 - Changed the switch `-NotAsDictionary` to `-Raw`. Works with `-Worksheetname *` reads all the sheets in the xlsx file and returns an array.