github.com/elliott5/community@v0.14.1-0.20160709191136-823126fb026a/documize/api/convert/html/html_test.go (about)

     1  // Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
     2  //
     3  // This software (Documize Community Edition) is licensed under 
     4  // GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
     5  //
     6  // You can operate outside the AGPL restrictions by purchasing
     7  // Documize Enterprise Edition and obtaining a commercial license
     8  // by contacting <sales@documize.com>. 
     9  //
    10  // https://documize.com
    11  
    12  package html_test
    13  
    14  import (
    15  	"strings"
    16  	"testing"
    17  )
    18  import "github.com/documize/community/wordsmith/api"
    19  import "github.com/documize/community/documize/api/convert/html"
    20  
    // b is the HTML fixture that TestHTML hands to html.SplitIfHTML as []byte(b).
    // It is a rendered copy of the "Markdown: Basics" page followed by h4, h5 and
    // h6 headings, with the h5 wrapped in six nested <div> elements. TestHTML
    // expects the page at index 10 to have a title starting "Header5" and a body
    // starting "Body 555.". The markup is test input: it is kept verbatim.
    21  const b string = `
    22  <h1>Markdown: Basics</h1>
    23  
    24  <ul id="ProjectSubmenu">
    25      <li><a href="/projects/markdown/" title="Markdown Project Page">Main</a></li>
    26      <li><a class="selected" title="Markdown Basics">Basics</a></li>
    27      <li><a href="/projects/markdown/syntax" title="Markdown Syntax Documentation">Syntax</a></li>
    28      <li><a href="/projects/markdown/license" title="Pricing and License Information">License</a></li>
    29      <li><a href="/projects/markdown/dingus" title="Online Markdown Web Form">Dingus</a></li>
    30  </ul>
    31  
    32  <h2>Getting the Gist of Markdown's Formatting Syntax</h2>
    33  
    34  <p>This page offers a brief overview of what it's like to use Markdown.
    35  The <a href="/projects/markdown/syntax" title="Markdown Syntax">syntax page</a> provides complete, detailed documentation for
    36  every feature, but Markdown should be very easy to pick up simply by
    37  looking at a few examples of it in action. The examples on this page
    38  are written in a before/after style, showing example syntax and the
    39  HTML output produced by Markdown.</p>
    40  
    41  <p>It's also helpful to simply try Markdown out; the <a href="/projects/markdown/dingus" title="Markdown Dingus">Dingus</a> is a
    42  web application that allows you type your own Markdown-formatted text
    43  and translate it to XHTML.</p>
    44  
    45  <p><strong>Note:</strong> This document is itself written using Markdown; you
    46  can <a href="/projects/markdown/basics.text">see the source for it by adding '.text' to the URL</a>.</p>
    47  
    48  <h2>Paragraphs, Headers, Blockquotes</h2>
    49  
    50  <p>A paragraph is simply one or more consecutive lines of text, separated
    51  by one or more blank lines. (A blank line is any line that looks like a
    52  blank line -- a line containing nothing spaces or tabs is considered
    53  blank.) Normal paragraphs should not be intended with spaces or tabs.</p>
    54  
    55  <p>Markdown offers two styles of headers: <em>Setext</em> and <em>atx</em>.
    56  Setext-style headers for <code>&lt;h1&gt;</code> and <code>&lt;h2&gt;</code> are created by
    57  &quot;underlining&quot; with equal signs (<code>=</code>) and hyphens (<code>-</code>), respectively.
    58  To create an atx-style header, you put 1-6 hash marks (<code>#</code>) at the
    59  beginning of the line -- the number of hashes equals the resulting
    60  HTML header level.</p>
    61  
    62  <p>Blockquotes are indicated using email-style '<code>&gt;</code>' angle brackets.</p>
    63  
    64  <p>Markdown:</p>
    65  
    66  <pre><code>A First Level Header
    67  ====================
    68  
    69  A Second Level Header
    70  ---------------------
    71  
    72  Now is the time for all good men to come to
    73  the aid of their country. This is just a
    74  regular paragraph.
    75  
    76  The quick brown fox jumped over the lazy
    77  dog's back.
    78  
    79  ### Header 3
    80  
    81  &gt; This is a blockquote.
    82  &gt; 
    83  &gt; This is the second paragraph in the blockquote.
    84  &gt;
    85  &gt; ## This is an H2 in a blockquote
    86  </code></pre>
    87  
    88  <p>Output:</p>
    89  
    90  <pre><code>&lt;h1&gt;A First Level Header&lt;/h1&gt;
    91  
    92  &lt;h2&gt;A Second Level Header&lt;/h2&gt;
    93  
    94  &lt;p&gt;Now is the time for all good men to come to
    95  the aid of their country. This is just a
    96  regular paragraph.&lt;/p&gt;
    97  
    98  &lt;p&gt;The quick brown fox jumped over the lazy
    99  dog's back.&lt;/p&gt;
   100  
   101  &lt;h3&gt;Header 3&lt;/h3&gt;
   102  
   103  &lt;blockquote&gt;
   104      &lt;p&gt;This is a blockquote.&lt;/p&gt;
   105  
   106      &lt;p&gt;This is the second paragraph in the blockquote.&lt;/p&gt;
   107  
   108      &lt;h2&gt;This is an H2 in a blockquote&lt;/h2&gt;
   109  &lt;/blockquote&gt;
   110  </code></pre>
   111  
   112  <h3>Phrase Emphasis</h3>
   113  
   114  <p>Markdown uses asterisks and underscores to indicate spans of emphasis.</p>
   115  
   116  <p>Markdown:</p>
   117  
   118  <pre><code>Some of these words *are emphasized*.
   119  Some of these words _are emphasized also_.
   120  
   121  Use two asterisks for **strong emphasis**.
   122  Or, if you prefer, __use two underscores instead__.
   123  </code></pre>
   124  
   125  <p>Output:</p>
   126  
   127  <pre><code>&lt;p&gt;Some of these words &lt;em&gt;are emphasized&lt;/em&gt;.
   128  Some of these words &lt;em&gt;are emphasized also&lt;/em&gt;.&lt;/p&gt;
   129  
   130  &lt;p&gt;Use two asterisks for &lt;strong&gt;strong emphasis&lt;/strong&gt;.
   131  Or, if you prefer, &lt;strong&gt;use two underscores instead&lt;/strong&gt;.&lt;/p&gt;
   132  </code></pre>
   133  
   134  <h2>Lists</h2>
   135  
   136  <p>Unordered (bulleted) lists use asterisks, pluses, and hyphens (<code>*</code>,
   137  <code>+</code>, and <code>-</code>) as list markers. These three markers are
   138  interchangable; this:</p>
   139  
   140  <pre><code>*   Candy.
   141  *   Gum.
   142  *   Booze.
   143  </code></pre>
   144  
   145  <p>this:</p>
   146  
   147  <pre><code>+   Candy.
   148  +   Gum.
   149  +   Booze.
   150  </code></pre>
   151  
   152  <p>and this:</p>
   153  
   154  <pre><code>-   Candy.
   155  -   Gum.
   156  -   Booze.
   157  </code></pre>
   158  
   159  <p>all produce the same output:</p>
   160  
   161  <pre><code>&lt;ul&gt;
   162  &lt;li&gt;Candy.&lt;/li&gt;
   163  &lt;li&gt;Gum.&lt;/li&gt;
   164  &lt;li&gt;Booze.&lt;/li&gt;
   165  &lt;/ul&gt;
   166  </code></pre>
   167  
   168  <p>Ordered (numbered) lists use regular numbers, followed by periods, as
   169  list markers:</p>
   170  
   171  <pre><code>1.  Red
   172  2.  Green
   173  3.  Blue
   174  </code></pre>
   175  
   176  <p>Output:</p>
   177  
   178  <pre><code>&lt;ol&gt;
   179  &lt;li&gt;Red&lt;/li&gt;
   180  &lt;li&gt;Green&lt;/li&gt;
   181  &lt;li&gt;Blue&lt;/li&gt;
   182  &lt;/ol&gt;
   183  </code></pre>
   184  
   185  <p>If you put blank lines between items, you'll get <code>&lt;p&gt;</code> tags for the
   186  list item text. You can create multi-paragraph list items by indenting
   187  the paragraphs by 4 spaces or 1 tab:</p>
   188  
   189  <pre><code>*   A list item.
   190  
   191      With multiple paragraphs.
   192  
   193  *   Another item in the list.
   194  </code></pre>
   195  
   196  <p>Output:</p>
   197  
   198  <pre><code>&lt;ul&gt;
   199  &lt;li&gt;&lt;p&gt;A list item.&lt;/p&gt;
   200  &lt;p&gt;With multiple paragraphs.&lt;/p&gt;&lt;/li&gt;
   201  &lt;li&gt;&lt;p&gt;Another item in the list.&lt;/p&gt;&lt;/li&gt;
   202  &lt;/ul&gt;
   203  </code></pre>
   204  
   205  <h3>Links</h3>
   206  
   207  <p>Markdown supports two styles for creating links: <em>inline</em> and
   208  <em>reference</em>. With both styles, you use square brackets to delimit the
   209  text you want to turn into a link.</p>
   210  
   211  <p>Inline-style links use parentheses immediately after the link text.
   212  For example:</p>
   213  
   214  <pre><code>This is an [example link](http://example.com/).
   215  </code></pre>
   216  
   217  <p>Output:</p>
   218  
   219  <pre><code>&lt;p&gt;This is an &lt;a href=&quot;http://example.com/&quot;&gt;
   220  example link&lt;/a&gt;.&lt;/p&gt;
   221  </code></pre>
   222  
   223  <p>Optionally, you may include a title attribute in the parentheses:</p>
   224  
   225  <pre><code>This is an [example link](http://example.com/ &quot;With a Title&quot;).
   226  </code></pre>
   227  
   228  <p>Output:</p>
   229  
   230  <pre><code>&lt;p&gt;This is an &lt;a href=&quot;http://example.com/&quot; title=&quot;With a Title&quot;&gt;
   231  example link&lt;/a&gt;.&lt;/p&gt;
   232  </code></pre>
   233  
   234  <p>Reference-style links allow you to refer to your links by names, which
   235  you define elsewhere in your document:</p>
   236  
   237  <pre><code>I get 10 times more traffic from [Google][1] than from
   238  [Yahoo][2] or [MSN][3].
   239  
   240  [1]: http://google.com/        &quot;Google&quot;
   241  [2]: http://search.yahoo.com/  &quot;Yahoo Search&quot;
   242  [3]: http://search.msn.com/    &quot;MSN Search&quot;
   243  </code></pre>
   244  
   245  <p>Output:</p>
   246  
   247  <pre><code>&lt;p&gt;I get 10 times more traffic from &lt;a href=&quot;http://google.com/&quot;
   248  title=&quot;Google&quot;&gt;Google&lt;/a&gt; than from &lt;a href=&quot;http://search.yahoo.com/&quot;
   249  title=&quot;Yahoo Search&quot;&gt;Yahoo&lt;/a&gt; or &lt;a href=&quot;http://search.msn.com/&quot;
   250  title=&quot;MSN Search&quot;&gt;MSN&lt;/a&gt;.&lt;/p&gt;
   251  </code></pre>
   252  
   253  <p>The title attribute is optional. Link names may contain letters,
   254  numbers and spaces, but are <em>not</em> case sensitive:</p>
   255  
   256  <pre><code>I start my morning with a cup of coffee and
   257  [The New York Times][NY Times].
   258  
   259  [ny times]: http://www.nytimes.com/
   260  </code></pre>
   261  
   262  <p>Output:</p>
   263  
   264  <pre><code>&lt;p&gt;I start my morning with a cup of coffee and
   265  &lt;a href=&quot;http://www.nytimes.com/&quot;&gt;The New York Times&lt;/a&gt;.&lt;/p&gt;
   266  </code></pre>
   267  
   268  <h3>Images</h3>
   269  
   270  <p>Image syntax is very much like link syntax.</p>
   271  
   272  <p>Inline (titles are optional):</p>
   273  
   274  <pre><code>![alt text](/path/to/img.jpg &quot;Title&quot;)
   275  </code></pre>
   276  
   277  <p>Reference-style:</p>
   278  
   279  <pre><code>![alt text][id]
   280  
   281  [id]: /path/to/img.jpg &quot;Title&quot;
   282  </code></pre>
   283  
   284  <p>Both of the above examples produce the same output:</p>
   285  
   286  <pre><code>&lt;img src=&quot;/path/to/img.jpg&quot; alt=&quot;alt text&quot; title=&quot;Title&quot; /&gt;
   287  </code></pre>
   288  
   289  <h3>Code</h3>
   290  
   291  <p>In a regular paragraph, you can create code span by wrapping text in
   292  backtick quotes. Any ampersands (<code>&amp;</code>) and angle brackets (<code>&lt;</code> or
   293  <code>&gt;</code>) will automatically be translated into HTML entities. This makes
   294  it easy to use Markdown to write about HTML example code:</p>
   295  
   296  <pre><code>I strongly recommend against using any "&lt;blink&gt;" tags.
   297  
   298  I wish SmartyPants used named entities like "&amp;mdash;""
   299  instead of decimal-encoded entites like "&amp;#8212;".
   300  </code></pre>
   301  
   302  <p>Output:</p>
   303  
   304  <pre><code>&lt;p&gt;I strongly recommend against using any
   305  &lt;code&gt;&amp;lt;blink&amp;gt;&lt;/code&gt; tags.&lt;/p&gt;
   306  
   307  &lt;p&gt;I wish SmartyPants used named entities like
   308  &lt;code&gt;&amp;amp;mdash;&lt;/code&gt; instead of decimal-encoded
   309  entites like &lt;code&gt;&amp;amp;#8212;&lt;/code&gt;.&lt;/p&gt;
   310  </code></pre>
   311  
   312  <p>To specify an entire block of pre-formatted code, indent every line of
   313  the block by 4 spaces or 1 tab. Just like with code spans, <code>&amp;</code>, <code>&lt;</code>,
   314  and <code>&gt;</code> characters will be escaped automatically.</p>
   315  
   316  <p>Markdown:</p>
   317  
   318  <pre><code>If you want your page to validate under XHTML 1.0 Strict,
   319  you've got to put paragraph tags in your blockquotes:
   320  
   321      &lt;blockquote&gt;
   322          &lt;p&gt;For example.&lt;/p&gt;
   323      &lt;/blockquote&gt;
   324  </code></pre>
   325  
   326  <p>Output:</p>
   327  
   328  <pre><code>&lt;p&gt;If you want your page to validate under XHTML 1.0 Strict,
   329  you've got to put paragraph tags in your blockquotes:&lt;/p&gt;
   330  
   331  &lt;pre&gt;&lt;code&gt;&amp;lt;blockquote&amp;gt;
   332      &amp;lt;p&amp;gt;For example.&amp;lt;/p&amp;gt;
   333  &amp;lt;/blockquote&amp;gt;
   334  &lt;/code&gt;&lt;/pre&gt;
   335  </code></pre>
   336  
   337  <h4>Header4</h4>
   338  <div><div><div><div><div><div>
   339  <h5>Header5</h5>Body 555.
   340  </div></div></div></div></div></div>
   341  <h6>Header6</h6>
   342  
   343  `
   344  
   345  func TestHTML(t *testing.T) {
   346  
   347  	req := &api.DocumentConversionRequest{}
   348  	res := &api.DocumentConversionResponse{}
   349  
   350  	err := html.SplitIfHTML(req, res)
   351  	if err != nil || len(res.PagesHTML) != 0 || len(res.Pages) != 0 || len(res.EmbeddedFiles) != 0 {
   352  		t.Error(err)
   353  		return
   354  	}
   355  
   356  	titleTooBig := []byte("<h1>")
   357  	for i := 0; i < 2048; i++ {
   358  		titleTooBig = append(titleTooBig, []byte("title too long ")...)
   359  	}
   360  	titleTooBig = append(titleTooBig, []byte("</h1>")...)
   361  	req = &api.DocumentConversionRequest{}
   362  	res = &api.DocumentConversionResponse{PagesHTML: titleTooBig}
   363  	err = html.SplitIfHTML(req, res)
   364  	if err != nil || len(res.Pages[0].Title) > 2000 {
   365  		t.Error(err)
   366  		return
   367  	}
   368  
   369  	req = &api.DocumentConversionRequest{}
   370  	res = &api.DocumentConversionResponse{PagesHTML: []byte(b)}
   371  	err = html.SplitIfHTML(req, res)
   372  	if err != nil {
   373  		t.Error(err)
   374  		return
   375  	}
   376  	//for p, pg := range res.Pages {
   377  	//	t.Logf("%d %d %d %s", p, pg.Level, len(pg.Body), pg.Title)
   378  	//}
   379  	if !strings.HasPrefix(res.Pages[10].Title, "Header5") ||
   380  		!strings.HasPrefix(string(res.Pages[10].Body), "Body 555.") {
   381  		t.Errorf("wrong page ten title: `%s` body: `%s`", res.Pages[10].Title, string(res.Pages[10].Body))
   382  	}
   383  
   384  }