Skip to content

Commit 7a28ff3

Browse files
committed
fixup! fixup! fixup! New implementation of TEI to Markdown, and Markdown to TEI conversions using XSLT
1 parent 6347f31 commit 7a28ff3

3 files changed

Lines changed: 101 additions & 62 deletions

File tree

src/main/xar-resources/services/manuforma-form-to-tei.xslt

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,6 @@
4444
</xsl:otherwise>
4545
</xsl:choose>
4646
</xsl:for-each>
47-
48-
<!-- xsl:apply-templates select="node()" mode="markdown-to-tei"/ -->
49-
5047
</xsl:copy>
5148
</xsl:template>
5249

@@ -56,15 +53,6 @@
5653
<xsl:with-param name="source" select="m2t:extract-source(@source)"/>
5754
</xsl:call-template>
5855
</xsl:template>
59-
60-
<!--
61-
<xsl:template match="tei:ab">
62-
<xsl:copy>
63-
<xsl:apply-templates select="@*"/>
64-
<xsl:apply-templates select="node()" mode="markdown-to-tei"/>
65-
</xsl:copy>
66-
</xsl:template>
67-
-->
6856

6957
<!-- Add a Contributor into the titleStmt Element -->
7058
<xsl:template match="tei:titleStmt">
@@ -207,7 +195,6 @@
207195
</xsl:copy>
208196
</xsl:template>
209197

210-
211198
<xsl:function name="m2t:extract-source" as="xs:string">
212199
<xsl:param name="input" as="xs:string" required="yes"/>
213200
<xsl:sequence select="concat('#', tokenize($input, '#')[last()])"/>

src/main/xar-resources/services/markdown-to-tei.xslt

Lines changed: 93 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -21,89 +21,137 @@
2121

2222
<!-- Replace heading 1 Markdown with tei:hi h1 Element -->
2323
<xsl:template match="text()[matches(., '(^|[^#])#\s+.+')]" mode="markdown-to-tei">
24-
<xsl:apply-templates select="m2t:h1(.)" mode="#current"/>
24+
<xsl:sequence select="m2t:h1(.)"/>
2525
</xsl:template>
2626

2727
<!-- Replace heading 2 Markdown with tei:hi h2 Element -->
2828
<xsl:template match="text()[matches(., '(^|[^#])##\s+.+')]" mode="markdown-to-tei">
29-
<xsl:apply-templates select="m2t:h2(.)" mode="#current"/>
29+
<xsl:sequence select="m2t:h2(.)"/>
3030
</xsl:template>
3131

3232
<!-- Replace heading 3 Markdown with tei:hi h3 Element -->
3333
<xsl:template match="text()[matches(., '(^|[^#])###\s+.+')]" mode="markdown-to-tei">
34-
<xsl:apply-templates select="m2t:h3(.)" mode="#current"/>
34+
<xsl:sequence select="m2t:h3(.)"/>
3535
</xsl:template>
3636

3737
<!-- Replace bold Markdown with tei:emph bold Element -->
3838
<xsl:template match="text()[matches(., '\*\*([^*]+)\*\*')]" mode="markdown-to-tei">
39-
<xsl:apply-templates select="m2t:bold(.)" mode="#current"/>
39+
<xsl:sequence select="m2t:bold(.)"/>
4040
</xsl:template>
4141

4242
<!-- Replace italic Markdown with tei:emph italic Element -->
4343
<xsl:template match="text()[matches(., '(^|[^*\\])\*[^*]*[^*\\]\*([^*]|$)')]" mode="markdown-to-tei">
44-
<xsl:apply-templates select="m2t:italic(.)" mode="#current"/>
44+
<xsl:sequence select="m2t:italic(.)"/>
4545
</xsl:template>
46-
47-
<!-- TODO(AR) the rule below adds a <tei:p> wrapper, the tei:p should only be applied when the parent of the text node is an appropriate container e.g. <tei:note>
48-
Ideally we want to keep tei:note and those kind of elements out of this XSLT - they should go in manuforma-form-to-tei.xspec
49-
Figure out whether it's best to move the tei:p creation to the manuforma-form-to-tei.xspec or to send some sort of mode/flag through to here and use different templates
50-
-->
5146

47+
<!-- Process Markdown paragraphs and convert them into tei:hi and tei:p Elements -->
5248
<xsl:template match="text()" mode="markdown-to-tei-container">
5349
<!-- First we split on \n\n+ whilst ignoring misc whitespace to create headings or paragraphs -->
54-
<xsl:variable name="paragraphs" select="tokenize(., '\n[ \t\r]*\n+')[not(matches(., '^[ \t\r]+$'))][string-length(.) gt 0]"/>
50+
<xsl:variable name="paragraphs" as="xs:string*" select="tokenize(., '\n[ \t\r]*\n+')[not(matches(., '^[ \t\r]+$'))][string-length(.) gt 0]"/>
5551
<xsl:for-each select="$paragraphs">
56-
<xsl:variable name="paragraph" select="."/>
52+
<xsl:variable name="paragraph" as="xs:string" select="."/>
53+
54+
<!-- We now split on \n whilst ignoring misc whitespace to create lines from the paragraph -->
55+
<xsl:variable name="lines" as="xs:string*" select="tokenize($paragraph, '\n')[not(matches(., '^[ \t\r]+$'))][string-length(.) gt 0]"/>
56+
<xsl:variable name="first-line" as="xs:string?" select="head($lines)"/>
57+
<xsl:variable name="remaining-lines" as="xs:string*" select="tail($lines)"/>
58+
5759
<xsl:choose>
58-
<xsl:when test="matches($paragraph, '^#\s+.+')">
59-
<!-- This is a Markdown heading 1 -->
60-
<xsl:apply-templates select="m2t:h1($paragraph)" mode="#current"/>
60+
<xsl:when test="matches($first-line, '^#\s+.+')">
61+
<!-- First line is a Markdown heading 1, so process that and then the remaining lines make up the paragraph -->
62+
<xsl:sequence select="m2t:h1($first-line)"/>
63+
<xsl:sequence select="m2t:paragraph($remaining-lines)"/>
6164
</xsl:when>
6265

63-
<xsl:when test="matches($paragraph, '^##\s+.+')">
64-
<!-- This is a Markdown heading 2 -->
65-
<xsl:apply-templates select="m2t:h2($paragraph)" mode="#current"/>
66+
<xsl:when test="matches($first-line, '^##\s+.+')">
67+
<!-- First line is a Markdown heading 2, so process that and then the remaining lines make up the paragraph -->
68+
<xsl:sequence select="m2t:h2($first-line)"/>
69+
<xsl:sequence select="m2t:paragraph($remaining-lines)"/>
6670
</xsl:when>
6771

68-
<xsl:when test="matches($paragraph, '^###\s+.+')">
69-
<!-- This is a Markdown heading 3 -->
70-
<xsl:apply-templates select="m2t:h3($paragraph)" mode="#current"/>
72+
<xsl:when test="matches($first-line, '^###\s+.+')">
73+
<!-- First line is a Markdown heading 3, so process that and then the remaining lines make up the paragraph -->
74+
<xsl:sequence select="m2t:h3($first-line)"/>
75+
<xsl:sequence select="m2t:paragraph($remaining-lines)"/>
7176
</xsl:when>
7277

7378
<xsl:otherwise>
74-
<!-- This is a Markdown paragraph -->
75-
<tei:p>
76-
<!-- We now split on /n whilst ignoring misc whitespace to create lines within a paragraph -->
77-
<xsl:variable name="lines" select="tokenize($paragraph, '\n')[not(matches(., '^[ \t\r]+$'))][string-length(.) gt 0]"/>
78-
<xsl:for-each select="(1 to count($lines))">
79-
<xsl:variable name="i" select="." as="xs:integer"/>
80-
<xsl:variable name="line" select="$lines[$i]"/>
81-
<xsl:if test="$i gt 1"><tei:lb/><xsl:text>&#xA;</xsl:text></xsl:if>
82-
<xsl:sequence select="m2t:bold-and-italic($line)"/>
83-
</xsl:for-each>
84-
</tei:p>
79+
<!-- First line is just text, so all lines make up the paragraph -->
80+
<xsl:sequence select="m2t:paragraph($lines)"/>
8581
</xsl:otherwise>
8682
</xsl:choose>
8783
</xsl:for-each>
8884
</xsl:template>
8985

90-
<!-- Handle Raw text nodes inside some container Element -->
91-
<!-- xsl:template match="text()" mode="markdown-to-tei" priority="-1">
92-
<xsl:for-each select="tokenize(., '\n[ \t\r]*\n+')[not(matches(., '^[ \t\r]+$'))][string-length(.) gt 0]">
93-
<xsl:variable name="lines" select="tokenize(., '\n')[not(matches(., '^[ \t\r]+$'))][string-length(.) gt 0]"/>
94-
<tei:p><xsl:for-each select="(1 to count($lines))"><xsl:variable name="i" select="." as="xs:integer"/><xsl:if test="$i gt 1"><tei:lb/><xsl:text>&#xA;</xsl:text></xsl:if><xsl:value-of select="$lines[$i]"/></xsl:for-each></tei:p>
86+
<!-- Replace non-TEI elements with tei:p if they don't have an ancestor tei:p -->
87+
<xsl:template match="element()[namespace-uri(.) ne 'http://www.tei-c.org/ns/1.0'][empty(ancestor::tei:p)]" mode="markdown-to-tei">
88+
<tei:p>
89+
<xsl:for-each select="node()">
90+
<xsl:choose>
91+
<xsl:when test=". instance of element() and namespace-uri(.) ne 'http://www.tei-c.org/ns/1.0'">
92+
<xsl:apply-templates select="./node()" mode="non-tei-container"/>
93+
</xsl:when>
94+
<xsl:otherwise>
95+
<xsl:apply-templates select="." mode="non-tei-container"/>
96+
</xsl:otherwise>
97+
</xsl:choose>
98+
</xsl:for-each>
99+
</tei:p>
100+
</xsl:template>
101+
102+
<!-- Drop non-TEI elements if they have an ancestor tei:p -->
103+
<xsl:template match="element()[namespace-uri(.) ne 'http://www.tei-c.org/ns/1.0'][ancestor::tei:p]" mode="markdown-to-tei">
104+
<xsl:for-each select="node()">
105+
<xsl:choose>
106+
<xsl:when test=". instance of element() and namespace-uri(.) ne 'http://www.tei-c.org/ns/1.0'">
107+
<xsl:apply-templates select="./node()" mode="non-tei-container"/>
108+
</xsl:when>
109+
<xsl:otherwise>
110+
<xsl:apply-templates select="." mode="non-tei-container"/>
111+
</xsl:otherwise>
112+
</xsl:choose>
95113
</xsl:for-each>
96-
</xsl:template -->
114+
</xsl:template>
115+
116+
<!-- Used above to replace non-TEI elements with tei:p -->
117+
<xsl:template match="node()|@*" mode="non-tei-container">
118+
<xsl:copy>
119+
<xsl:for-each select="node()|@*">
120+
<xsl:choose>
121+
<xsl:when test=". instance of element() and namespace-uri(.) ne 'http://www.tei-c.org/ns/1.0'">
122+
<xsl:apply-templates select="./node()" mode="#current"/>
123+
</xsl:when>
124+
<xsl:otherwise>
125+
<xsl:apply-templates select="." mode="#current"/>
126+
</xsl:otherwise>
127+
</xsl:choose>
128+
</xsl:for-each>
129+
</xsl:copy>
130+
</xsl:template>
97131

98132
<!-- Default: Identity transform everything -->
99133
<xsl:template match="node()|@*" mode="markdown-to-tei" priority="-3">
100134
<xsl:copy>
101135
<xsl:apply-templates select="node()|@*" mode="#current"/>
102136
</xsl:copy>
103137
</xsl:template>
138+
139+
<xsl:function name="m2t:paragraph" as="node()*">
140+
<xsl:param name="lines" as="item()*" required="yes"/>
141+
<xsl:if test="exists($lines)">
142+
<tei:p>
143+
<xsl:for-each select="(1 to count($lines))">
144+
<xsl:variable name="i" select="." as="xs:integer"/>
145+
<xsl:variable name="line" select="$lines[$i]"/>
146+
<xsl:if test="$i gt 1"><tei:lb/><xsl:text>&#xA;</xsl:text></xsl:if>
147+
<xsl:sequence select="m2t:bold-and-italic($line)"/>
148+
</xsl:for-each>
149+
</tei:p>
150+
</xsl:if>
151+
</xsl:function>
104152

105153
<xsl:function name="m2t:h1" as="node()+">
106-
<xsl:param name="markdown" required="true"/>
154+
<xsl:param name="markdown" required="yes"/>
107155
<xsl:analyze-string select="$markdown" regex="(^|[^#])#\s+(.+)">
108156
<xsl:matching-substring>
109157
<xsl:if test="m2t:non-empty(regex-group(1))"><xsl:value-of select="regex-group(1)"/></xsl:if><tei:hi rend="h1"><xsl:sequence select="m2t:bold-and-italic(normalize-space(regex-group(2)))"/></tei:hi>
@@ -115,7 +163,7 @@
115163
</xsl:function>
116164

117165
<xsl:function name="m2t:h2" as="node()+">
118-
<xsl:param name="markdown" required="true"/>
166+
<xsl:param name="markdown" required="yes"/>
119167
<xsl:analyze-string select="$markdown" regex="(^|[^#])##\s+(.+)">
120168
<xsl:matching-substring>
121169
<xsl:if test="m2t:non-empty(regex-group(1))"><xsl:value-of select="regex-group(1)"/></xsl:if><tei:hi rend="h2"><xsl:sequence select="m2t:bold-and-italic(normalize-space(regex-group(2)))"/></tei:hi>
@@ -127,7 +175,7 @@
127175
</xsl:function>
128176

129177
<xsl:function name="m2t:h3" as="node()+">
130-
<xsl:param name="markdown" required="true"/>
178+
<xsl:param name="markdown" required="yes"/>
131179
<xsl:analyze-string select="$markdown" regex="(^|[^#])###\s+(.+)">
132180
<xsl:matching-substring>
133181
<xsl:if test="m2t:non-empty(regex-group(1))"><xsl:value-of select="regex-group(1)"/></xsl:if><tei:hi rend="h3"><xsl:sequence select="m2t:bold-and-italic(normalize-space(regex-group(2)))"/></tei:hi>
@@ -139,7 +187,7 @@
139187
</xsl:function>
140188

141189
<xsl:function name="m2t:bold" as="node()+">
142-
<xsl:param name="markdown" required="true"/>
190+
<xsl:param name="markdown" required="yes"/>
143191
<xsl:analyze-string select="$markdown" regex="(^|[^\\])\*\*([^*]*[^*\\])\*\*">
144192
<xsl:matching-substring>
145193
<xsl:if test="m2t:non-empty(regex-group(1))"><xsl:value-of select="regex-group(1)"/></xsl:if><tei:emph rend="bold"><xsl:value-of select="regex-group(2)"/></tei:emph>
@@ -151,7 +199,7 @@
151199
</xsl:function>
152200

153201
<xsl:function name="m2t:italic" as="node()+">
154-
<xsl:param name="markdown" required="true"/>
202+
<xsl:param name="markdown" required="yes"/>
155203
<xsl:analyze-string select="$markdown" regex="(^|[^*\\])\*([^*]*[^*\\])\*([^*]|$)">
156204
<xsl:matching-substring>
157205
<xsl:if test="m2t:non-empty(regex-group(1))"><xsl:value-of select="regex-group(1)"/></xsl:if><tei:emph rend="italic"><xsl:value-of select="regex-group(2)"/></tei:emph><xsl:if test="m2t:non-empty(regex-group(3))"><xsl:value-of select="regex-group(3)"/></xsl:if>
@@ -163,14 +211,14 @@
163211
</xsl:function>
164212

165213
<xsl:function name="m2t:bold-and-italic" as="node()+">
166-
<xsl:param name="markdown" required="true"/>
214+
<xsl:param name="markdown" required="yes"/>
167215
<xsl:variable name="after-bold" select="for $x in $markdown return if ($x instance of text() or $x instance of xs:string) then m2t:bold($x) else $x"/>
168216
<xsl:variable name="after-italic" select="for $x in $after-bold return if ($x instance of text() or $x instance of xs:string) then m2t:italic($x) else $x"/>
169217
<xsl:sequence select="$after-italic"/>
170218
</xsl:function>
171219

172220
<xsl:function name="m2t:non-empty" as="xs:string*">
173-
<xsl:param name="inputs" as="xs:string*" required="true"/>
221+
<xsl:param name="inputs" as="xs:string*" required="yes"/>
174222
<xsl:sequence select="$inputs[string-length(.) gt 0]"/>
175223
</xsl:function>
176224

src/test/xar-resources/services/manuforma-form-to-tei.xspec

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,10 @@
7777
</x:context>
7878

7979
<x:expect label="Then copy as is">
80+
<tei:note>
8081
<tei:p><tei:emph rend="bold">Hello</tei:emph> there and welcome</tei:p>
8182
<tei:p>Goodbye, and see you <tei:emph rend="italic">again</tei:emph> soon</tei:p>
83+
</tei:note>
8284
</x:expect>
8385

8486
</x:scenario>
@@ -92,8 +94,10 @@
9294
</x:context>
9395

9496
<x:expect label="Then copy as is">
97+
<tei:note>
9598
<tei:p><tei:emph rend="bold">Hello</tei:emph> there and welcome</tei:p>
9699
<tei:p>Goodbye, and see you <tei:emph rend="italic">again</tei:emph> soon</tei:p>
100+
</tei:note>
97101
</x:expect>
98102

99103
</x:scenario>
@@ -366,15 +370,15 @@ Paragraph 3-3.</tei:note>
366370
<x:context>
367371
<tei:TEI>
368372
<tei:note>Once &#xD;upon&#xD;&#xA;a time&#xD; in the &#xD;&#xA;&#xD; West
369-
373+
370374
There was a little Panda called Jennifer
371-
375+
372376
who was a sleepy little panda</tei:note>
373377
</tei:TEI>
374378
</x:context>
375-
379+
376380
<!-- TODO(AR) check this with Max - should &#xD;&#xA;&#xD; produce one or two <tei:lb> in the output -->
377-
381+
378382
<x:expect label="Wrap each paragraph in a p element and normalize paragraph breaks with lb elements">
379383
<tei:TEI>
380384
<tei:note>

0 commit comments

Comments
 (0)