|
The following example shows one method of crawling an entire
site and converting it to PDF. Here we use JavaScript to
determine the links present on each page. However, you could
equally well use the HtmlCallback to do the
same thing.
[C#]
// Crawl a site starting at theURL, render every discovered page into a
// single PDF document, then link, flatten and save it.
Doc theDoc = new Doc();
string theURL = "http://www.fbi.gov/";
// Set HTML options
theDoc.HtmlOptions.AddLinks = true;
theDoc.HtmlOptions.UseScript = true;
theDoc.HtmlOptions.PageCacheEnabled = false;
// JavaScript is used to extract all links from the page. The script joins
// every anchor href with commas and stores the result on the document
// element so it can be read back with GetScriptReturn.
theDoc.HtmlOptions.OnLoadScript = "var hrefCollection = document.all.tags(\"a\");" +
    "var allLinks = \"\";" +
    "for(i = 0; i < hrefCollection.length; ++i) {" +
    "if (i > 0)" +
    " allLinks += \",\";" +
    "allLinks += hrefCollection.item(i).href;" +
    "};" +
    "document.documentElement.abcpdf = allLinks;";
// Array of links - start with base URL.
// The list grows while we iterate, so links.Count is re-read on every pass.
ArrayList links = new ArrayList();
links.Add(theURL);
for (int i = 0; i < links.Count; i++) {
    // Stop if we render more than 20 pages
    if (theDoc.PageCount > 20) {
        break;
    }
    // Add page
    theDoc.Page = theDoc.AddPage();
    int theID = theDoc.AddImageUrl(links[i] as string);
    // Links from the rendered page
    string allLinks = theDoc.HtmlOptions.GetScriptReturn(theID);
    // NOTE(review): defensive guard in case no script result is available
    // for this page - confirm whether GetScriptReturn can return null.
    if (allLinks == null) {
        allLinks = "";
    }
    string[] newLinks = allLinks.Split(new char[] { ',' });
    foreach (string link in newLinks) {
        // Check to see if we already queued this page. The list is kept in
        // discovery order (unsorted), so use Contains rather than
        // BinarySearch, which is only valid on a sorted list.
        if (!links.Contains(link)) {
            // Skip links inside a page (fragment links) whose base URL is
            // already queued
            int pos = link.IndexOf('#');
            if (!(pos > 0 && links.Contains(link.Substring(0, pos)))) {
                // Only follow links that stay within the starting site
                if (link.StartsWith(theURL)) {
                    links.Add(link);
                }
            }
        }
    }
    // Add other pages: keep chaining while the HTML overflows the page
    while (true) {
        theDoc.FrameRect();
        if (!theDoc.Chainable(theID)) {
            break;
        }
        theDoc.Page = theDoc.AddPage();
        theID = theDoc.AddImageToChain(theID);
    }
}
// Link pages together
theDoc.HtmlOptions.LinkPages();
// Flatten all pages
for (int i = 1; i <= theDoc.PageCount; i++) {
    theDoc.PageNumber = i;
    theDoc.Flatten();
}
// Save the document
theDoc.Save(Server.MapPath("HtmlOptionsJavaScript.pdf"));
theDoc.Clear();
[Visual Basic]
' Crawl a site starting at theURL, render every discovered page into a
' single PDF document, then link, flatten and save it.
Dim theDoc As Doc = New Doc()
Dim theURL As String = "http://www.fbi.gov/"
' Set HTML options
theDoc.HtmlOptions.AddLinks = True
theDoc.HtmlOptions.UseScript = True
theDoc.HtmlOptions.PageCacheEnabled = False
' JavaScript is used to extract all links from the page. The script joins
' every anchor href with commas and stores the result on the document
' element so it can be read back with GetScriptReturn.
theDoc.HtmlOptions.OnLoadScript = "var hrefCollection = document.all.tags(""a"");" & _
    "var allLinks = """";" & _
    "for(i = 0; i < hrefCollection.length; ++i) {" & _
    "if (i > 0)" & _
    " allLinks += "","";" & _
    "allLinks += hrefCollection.item(i).href;" & _
    "};" & _
    "document.documentElement.abcpdf = allLinks;"
' Array of links - start with base URL
Dim links As ArrayList = New ArrayList()
links.Add(theURL)
Dim i, theID As Integer
' The list grows while we crawl. A For ... To loop evaluates its upper bound
' only once, so it would never visit newly added links; a While loop
' re-reads links.Count on every pass.
i = 0
While i < links.Count
    ' Stop if we render more than 20 pages
    If theDoc.PageCount > 20 Then Exit While
    ' Add page
    theDoc.Page = theDoc.AddPage()
    theID = theDoc.AddImageUrl(CStr(links(i)))
    ' Links from the rendered page
    Dim allLinks As String
    allLinks = theDoc.HtmlOptions.GetScriptReturn(theID)
    ' NOTE(review): defensive guard in case no script result is available
    ' for this page - confirm whether GetScriptReturn can return Nothing.
    If allLinks Is Nothing Then allLinks = ""
    Dim newLinks() As String
    newLinks = allLinks.Split(New Char() {","c})
    Dim link As String
    For Each link In newLinks
        ' Check to see if we already queued this page. The list is kept in
        ' discovery order (unsorted), so use Contains rather than
        ' BinarySearch, which is only valid on a sorted list.
        If Not links.Contains(link) Then
            ' Skip links inside a page (fragment links) whose base URL is
            ' already queued
            Dim pos As Integer
            pos = link.IndexOf("#"c)
            ' AndAlso short-circuits, so Substring is never called with a
            ' negative length when the link has no "#"
            If Not (pos > 0 AndAlso links.Contains(link.Substring(0, pos))) Then
                ' Only follow links that stay within the starting site
                If link.StartsWith(theURL) Then links.Add(link)
            End If
        End If
    Next
    ' Add other pages: keep chaining while the HTML overflows the page
    Do
        theDoc.FrameRect()
        If Not theDoc.Chainable(theID) Then Exit Do
        theDoc.Page = theDoc.AddPage()
        theID = theDoc.AddImageToChain(theID)
    Loop
    i += 1
End While
' Link pages together
theDoc.HtmlOptions.LinkPages()
' Flatten all pages
For i = 1 To theDoc.PageCount
    theDoc.PageNumber = i
    theDoc.Flatten()
Next
' Save the document
theDoc.Save(Server.MapPath("HtmlOptionsJavaScript.pdf"))
theDoc.Clear()

HtmlOptionsJavaScript.pdf - [Page 1] |

HtmlOptionsJavaScript.pdf - [Page 2] |

HtmlOptionsJavaScript.pdf - [Page 3] |

HtmlOptionsJavaScript.pdf - [Page 4] |
|
|
|