epub:html-splitter html-splitter
epubtools/modules/html-splitter/xpl/html-splitter.xpl
Import URI: ../../html-splitter/xpl/html-splitter.xpl
Sample invocation (for debugging purposes):
calabash/calabash.sh -i source=file:/$(cygpath -ma ../content/output/debug/epubtools/create-ops/pre-split.html) -i conf=file:/$(cygpath -ma adaptions/publisher/series/epubtools/heading-conf.xml) -o result=tmp.html -o report=report.xml -o files=files.xml file:/$(cygpath -ma epubtools/modules/html-splitter/xpl/html-splitter.xpl) base-uri=file:/$(cygpath -ma ../content/output/debug/epubtools/create-ops/pre-split.html) debug=yes debug-dir-uri=file:/$(cygpath -ma ../content/output/debug)
Calabash seems to suppress some XSLT errors, for instance when a stylesheet is looping. Therefore, it might be necessary to replace collection()[…] with document(…) in the XSL (alternative variable declarations are already included in the XSL file, commented out) and run Saxon from the command line, for example like this:
PRE_SPLIT=file:/$(cygpath -ma ../content/le-tex/whitepaper/de/output/output/debug/epubtools/create-ops/pre-split.html) saxon -xsl:epubtools/modules/html-splitter/xsl/html-splitter.xsl -s:$PRE_SPLIT -it:main \ debug-dir-uri=file:/$(cygpath -ma debug) \ debug=yes \ final-pub-type=EPUB2 \ heading-conf-uri=file:/$(cygpath -ma adaptions/common/epubtools/heading-conf.xml) \ meta-uri=file:/$(cygpath -ma ../content/le-tex/whitepaper/de/output/output/debug/epubtools/epub-config.xml) \ datadir=file:/$(cygpath -ma debug/datadir)
Input Ports
Name | Documentation | Connections |
---|---|---|
sourceⓅ | ||
confⓈ | /hierarchy – may be included in /epub-config | |
meta | /epub-config | |
css-xml | XML representation of the parsed CSS |
Output Ports
Name | Documentation | Connections |
---|---|---|
resultⓅ | ||
files | ||
report | ||
unused-css-resourcesⓈ | ||
splitting-reportⓈ |
Options
Name | Documentation | Default |
---|---|---|
base-uriⓇ | ||
target | 'EPUB2' | |
debug | 'no' | |
debug-dir-uri | 'debug' |
Subpipeline
Step | Inputs | Outputs | Options | |||||
---|---|---|---|---|---|---|---|---|
p:variable css-handling | (/epub-config/@css-handling, 'regenerated-per-split')[1] | |||||||
p:variable html-subdir-name | (/epub-config/@html-subdir-name, '')[1] | |||||||
p:identity strip-leading-non-elements Strip spurious text-only document nodes that sometimes occurred before the HTML document. |
| result | ||||||
p:try html-splitter-group | You might need to comment out this p:try/p:catch and move name="html-splitter-group" to the following p:group in order to facilitate debugging if there is an error in the splitter XSLT. In extreme cases, it might be necessary to invoke the XSLT directly. For instructions, see the comments after the xsl:param instructions in html-splitter.xsl. | |||||||
p:group d256e84 | ||||||||
p:variable workdir | replace($base-uri, '^(.*[/])+(.*)', '$1') | |||||||
p:variable basename | replace($base-uri, '^(.*[/])+(.*?)(\.[\w.]+)$', '$2') | |||||||
p:variable indent | (/epub-config/@indent, 'true')[1] | |||||||
tr:store-debug d256e130 |
| result | pipeline-step = concat('epubtools/html-splitter/', $basename, '/splitter-input') active = $debug base-uri = $debug-dir-uri | |||||
p:xslt split |
| result | template-name = 'main' | |||||
tr:store-debug d256e171 | result | pipeline-step = concat('epubtools/html-splitter/', $basename, '/chunks') active = $debug base-uri = $debug-dir-uri | ||||||
p:for-each store-debug-try | ||||||||
p:store d256e188 |
| result | href = base-uri() | |||||
p:identity splitting-report | result | |||||||
p:sink d256e202 |
| |||||||
p:choose per-split-css | ||||||||
contains($css-handling, 'regenerated-per-split') | ||||||||
p:xslt per-split-css-xml-representations Primary output: the new, reduced common CSS. Secondary port: individual CSS files if applicable. Also on secondary port: A file named 'unused-css-resources.xml' |
| result | ||||||
p:sink d256e259 |
| |||||||
p:identity unused-css-resources |
| result | ||||||
tr:store-debug d256e270 |
| result | pipeline-step = concat('epubtools/html-splitter/', $basename, '/unused-css-resources') active = $debug base-uri = $debug-dir-uri | |||||
p:sink d256e279 | ||||||||
p:identity individual-css-representations |
| result | ||||||
p:sink d256e289 |
| |||||||
p:xslt insert-individual-css-link Will insert links for per-split css. Has side effect: svg namespace fixup. |
| result | template-name = 'main' | |||||
p:for-each d256e319 | ||||||||
tr:store-debug d256e328 | result | pipeline-step = concat('epubtools/html-splitter/', $basename, '/individual-css-links/', replace(base-uri(), '^.+/', '')) active = $debug base-uri = $debug-dir-uri | ||||||
p:for-each generate-css | ||||||||
css:generate gen |
| result | prepend-resource-path = '../' strip-comments = if (contains($css-handling, 'remove-comments')) then 'true' else 'false' | |||||
tr:store-debug d256e355 | result | pipeline-step = concat('epubtools/html-splitter/', $basename, '/per-split-css/', replace(base-uri(), '^.+/', '')) active = $debug base-uri = $debug-dir-uri | ||||||
p:sink d256e365 |
| |||||||
$css-handling = 'unchanged' | ||||||||
p:identity d256e377 | result | |||||||
p:otherwise | regenerated | |||||||
css:generate gen |
| result | prepend-resource-path = '../' | |||||
p:sink d256e406 | ||||||||
p:identity d256e409 | result | |||||||
p:for-each store-chunks | ||||||||
p:variable chunk-file-uri base-uri(/*) instead of base-uri() because we set the base uri of the primary CSS by adding an xml:base attribute. | replace(base-uri(/*), 'chunks/', 'epub/OEBPS/') | |||||||
p:choose d256e446 | ||||||||
$target = 'EPUB3' and matches(base-uri(), 'nav\.xhtml$') and (normalize-space($html-subdir-name)) | Brute force link correction for the generated landmarks nav that will be stored to OEBPS even when the remainder of the HTML is stored to a subdir. | |||||||
p:viewport d256e453 | ||||||||
p:otherwise | ||||||||
p:identity d256e464 |
| result | ||||||
p:identity postprocessing |
| result | ||||||
p:delete d256e471 |
| result | match = '@srcpath | @source-dir-uri' | |||||
p:choose d256e473 | ||||||||
matches($chunk-file-uri, '\.ncx$' ) | ||||||||
p:store store-chunk | result | include-content-type = 'true' omit-xml-declaration = 'false' indent = 'true' href = $chunk-file-uri doctype-public = if($target eq 'EPUB3') then '' else '-//NISO//DTD ncx 2005-1//EN' doctype-system = if($target eq 'EPUB3') then '' else 'http://www.daisy.org/z3986/2005/ncx-2005-1.dtd' | ||||||
matches($chunk-file-uri, '\.(txt|css)$') | ||||||||
p:store d256e493 | result | method = 'text' encoding = 'UTF-8' href = $chunk-file-uri | ||||||
$target = 'EPUB3' and matches(base-uri(), 'nav\.xhtml$') and (normalize-space($html-subdir-name)) | ||||||||
p:store store-chunk | result | include-content-type = 'false' omit-xml-declaration = 'false' method = 'xhtml' indent = if ($indent = 'true') then 'true' else 'false' href = $chunk-file-uri | ||||||
$target eq 'EPUB3' | ||||||||
p:delete d256e514 | result | match = 'html:meta[@name = 'sequence']' | ||||||
p:store store-chunk | result | include-content-type = 'false' omit-xml-declaration = 'false' method = 'xhtml' indent = if ($indent = 'true') then 'true' else 'false' href = $chunk-file-uri | ||||||
$target = ('EPUB2', 'KF8') and matches(base-uri(), 'nav\.xhtml$') | Drop nav.xhtml for EPUB2 and KF8 | |||||||
p:sink d256e529 | ||||||||
p:otherwise | ||||||||
p:delete d256e536 | result | match = 'html:meta[@name = 'sequence'] | @epub:type | html:nav[@epub:type = 'landmarks']' | ||||||
p:rename d256e538 | result | match = 'html:nav' new-name = 'div' new-namespace = 'http://www.w3.org/1999/xhtml' | ||||||
p:store store-chunk | result | include-content-type = 'true' omit-xml-declaration = 'false' method = 'xhtml' doctype-public = '-//W3C//DTD XHTML 1.1//EN' doctype-system = 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd' indent = if ($indent = 'true') then 'true' else 'false' href = $chunk-file-uri | ||||||
p:xslt collect-file-uri |
| result | ||||||
p:sink d256e564 |
| |||||||
p:add-attribute patch-xml-base |
| result | attribute-name = 'xml:base' match = '/*' attribute-value = $chunk-file-uri | |||||
p:sink d256e578 |
| |||||||
p:for-each signal-splitting-error | The presence of an orig.txt is an indicator that the split text differs from the original text. We’ll raise an error. We don’t do it immediately within the split step because we want to store the results first so that you can do forensics. | |||||||
p:store store-orig-txt |
| result | method = 'text' href = base-uri() | |||||
p:identity d256e595 | result | |||||||
p:store store-chunks-txt | result | method = 'text' href = base-uri() | ||||||
p:for-each store-debug | ||||||||
p:store d256e616 |
| result | href = base-uri() | |||||
p:add-attribute orig-txt-url |
| result | match = '/html:p/html:a[1]' attribute-name = 'href' attribute-value = base-uri() | |||||
p:add-attribute chunks-txt-url |
| result | match = '/html:p/html:a[2]' attribute-name = 'href' attribute-value = replace(base-uri(), 'orig\.txt$', 'chunks.txt') | |||||
p:error splitting-error |
| result | code = 'epub:SPLT01' | |||||
p:wrap-sequence d256e659 |
| result | wrapper = 'document' wrapper-namespace = 'http://xmlcalabash.com/ns/extensions' wrapper-prefix = 'cx' | |||||
p:add-attribute wrap-chunks | result | match = '/*' attribute-name = 'name' attribute-value = 'wrap-chunks' | ||||||
p:wrap-sequence d256e669 |
| result | wrapper = 'document' wrapper-namespace = 'http://xmlcalabash.com/ns/extensions' wrapper-prefix = 'cx' | |||||
tr:store-debug wrap-chunk-uris | result | pipeline-step = concat('epubtools/html-splitter/', $basename, '/result') active = $debug base-uri = $debug-dir-uri | ||||||
p:catch split-failed | error | |||||||
p:variable basename | replace($base-uri, '^(.*[/])+(.*?)(\.[\w.]+)$', '$2') | |||||||
tr:propagate-caught-error propagate |
| result | msg-file = 'splitter-error.txt' code = 'epub:SPLT01' status-dir-uri = concat($debug-dir-uri, '/status') | |||||
tr:store-debug store-error-message | result | pipeline-step = concat('epubtools/html-splitter/', $basename, '/ERROR_split') active = $debug base-uri = $debug-dir-uri | ||||||
cx:message d256e739 |
| result | message = '[ERROR] split failed with error message: ', . | |||||
p:identity errors | result | |||||||
p:sink d256e746 | ||||||||
p:add-attribute d256e750 |
| result | match = '/*' attribute-name = 'tr:step-name' attribute-value = 'html-splitter' | |||||
p:add-attribute report | result | match = '/*' attribute-name = 'tr:rule-family' attribute-value = 'html-splitter' |