@@ -29,15 +29,31 @@ const skip = new Set([
2929 // Not in HTML
3030 25 , 54 ,
3131] ) ;
// Limit handed to async.eachLimit below for the number of reports processed at once.
const MAX_CONCURRENCY = 5;
// When true, recurseStandard keeps following "Previous Version" links even if the
// revision it just recorded was already present.
const REFETCH_OLD_VERSIONS = false;

// Visit every report number, then write the collected entries with their keys sorted.
async.eachLimit(
    range(1, MAX_REPORT),
    MAX_CONCURRENCY,
    (num, cb) => {
        if (!skip.has(num)) {
            // Start from the report's landing URL; a null latestId marks this as the
            // first document scanned for the report.
            recurseStandard(num, `https://www.unicode.org/reports/tr${num}/`, null, cb);
            return;
        }
        console.log('Skipping report #' + num);
        cb();
    },
    (err) => {
        if (!err) {
            // Rebuild the map in sorted key order before handing it to the writer.
            const output = {};
            Object.keys(current).sort().forEach((key) => {
                output[key] = current[key];
            });
            helper.writeBiblio(FILENAME, output);
            return;
        }
        console.log('there was an error');
        console.error(err);
    }
);
55+
56+ function recurseStandard ( num , url , latestId , cb ) {
4157 console . log ( 'Fetching' , url , '...' ) ;
4258 request ( {
4359 url,
@@ -53,13 +69,7 @@ async.each(range(1, MAX_REPORT), (num, cb) => {
5369 console . log ( 'Parsing' , url , '...' ) ;
5470 const dom = new JSDOM ( body , { url } ) ;
5571 const { document } = dom . window ;
56- const type = document . title . slice ( 0 , 3 ) ;
57- if ( type !== 'UTS' && type !== 'UTR' && type !== 'UAX' ) {
58- console . log ( 'Unable to parse title' , document . title ) ;
59- cb ( ) ;
60- return ;
61- }
62- const id = type + num ;
72+
6373 const statusEl = document . querySelector ( '.body > h2' ) ;
6474 if ( ! statusEl ) {
6575 console . log ( 'Unable to find status' ) ;
@@ -68,6 +78,24 @@ async.each(range(1, MAX_REPORT), (num, cb) => {
6878 }
6979 const status = trimText ( statusEl . textContent ) ;
7080
81+ let type = document . title . match ( / \b ( U T S | U T R | U A X ) / ) ;
82+ if ( type !== 'UTS' && type !== 'UTR' && type !== 'UAX' ) {
83+ // Fallback for https://www.unicode.org/reports/tr35/
84+ const lowerStatus = status . toLowerCase ( ) ;
85+ if ( lowerStatus . indexOf ( 'technical standard' ) != - 1 ) {
86+ type = 'UTS' ;
87+ } else if ( lowerStatus . indexOf ( 'standard annex' ) != - 1 ) {
88+ type = 'UAX' ;
89+ } else if ( lowerStatus . indexOf ( 'technical report' ) != - 1 ) {
90+ type = 'UTR' ;
91+ } else {
92+ console . log ( 'Unable to parse document type' ) ;
93+ cb ( ) ;
94+ return ;
95+ }
96+ }
97+ const thisId = type + num ;
98+
7199 const titleEl = statusEl . nextElementSibling ;
72100 if ( ! titleEl || titleEl . tagName !== 'H1' ) {
73101 console . log ( 'Unable to find title' ) ;
@@ -86,62 +114,102 @@ async.each(range(1, MAX_REPORT), (num, cb) => {
86114 return ;
87115 }
88116
117+ if ( latestId == null ) {
118+ // This is first scanned document, so the latest version.
119+ latestId = thisId ;
120+
121+ const authors = infoTable . Editor && parseEditor ( infoTable . Editor ) ;
122+ if ( ! authors ) {
123+ console . log ( 'Unable to find/parse editors in table' ) ;
124+ cb ( ) ;
125+ return ;
126+ }
127+
128+ current [ thisId ] = {
129+ href : url ,
130+ authors,
131+ etAl : authors . etAl ,
132+ title,
133+ status,
134+ publisher : 'Unicode Consortium' ,
135+ versions : current [ latestId ] ?. versions ?? { }
136+ } ;
137+ } else if ( thisId != latestId ) {
138+ // The document was renamed at some point - create link
139+ current [ thisId ] = { aliasOf : latestId } ;
140+ }
141+
89142 const date = trimText ( infoTable . Date ) ;
90- if ( ! date ) {
143+ if ( ! date || ! / \d { 4 } - \d { 2 } - \d { 2 } / . test ( date ) ) {
91144 console . log ( 'Unable to find date in table' ) ;
92145 cb ( ) ;
93146 return ;
94147 }
95- let isRawDate = / \d { 4 } - \d { 2 } - \d { 2 } / . test ( date ) ;
96148
97- const href = processURL ( infoTable [ 'This Version' ] || url ) ;
149+ const href = processURL ( infoTable [ 'This Version' ] ) ;
150+ if ( ! href ) {
151+ console . log ( 'Failed to extract version URL' ) ;
152+ cb ( ) ;
153+ return ;
154+ }
98155
99- const authors = infoTable . Editor && parseEditor ( infoTable . Editor ) ;
100- if ( ! authors ) {
101- console . log ( 'Unable to find/parse editors in table ' ) ;
156+ const revision = parseRevision ( href ) ;
157+ if ( ! revision ) {
158+ console . log ( 'Failed to extract revision ' ) ;
102159 cb ( ) ;
103160 return ;
104161 }
105162
106- if ( type !== 'UAX' && current [ `UAX ${ num } ` ] )
107- current [ `UAX ${ num } ` ] = { aliasOf : id } ;
108- if ( type !== 'UTR' && current [ `UTR ${ num } ` ] )
109- current [ `UTR ${ num } ` ] = { aliasOf : id } ;
110- if ( type !== 'UTS' && current [ `UTS ${ num } ` ] )
111- current [ `UTS ${ num } ` ] = { aliasOf : id } ;
163+ const version = parseVersion ( infoTable . Version ) ;
164+
165+ if ( version )
166+ title = ` ${ title } version ${ version } ` ;
167+ else
168+ title = ` ${ title } revision ${ revision } ` ;
112169
113- current [ id ] = {
114- authors,
115- etAl : authors . etAl ,
170+ const wasAlreadyDefined = revision in current [ latestId ] . versions ;
171+ current [ latestId ] . versions [ revision ] = {
116172 href,
173+ rawDate : date ,
117174 title,
118- date : isRawDate ? undefined : date ,
119- rawDate : isRawDate ? date : undefined ,
120- status,
121- publisher : 'Unicode Consortium'
175+ status : current [ latestId ] . status != status ? status : undefined ,
122176 } ;
177+
178+ /*
179+ * If this revision was already defined, then don't waste time and bandwidth fetching
180+ * previous revisions which should have no changes.
181+ *
182+ * We're running this check after updating the information for this version in case this
183+ * is the latest and is a WIP, as we have already downloaded it anyway.
184+ */
185+ if ( ! wasAlreadyDefined || REFETCH_OLD_VERSIONS ) {
186+ const previousUrl = processURL ( infoTable [ 'Previous Version' ] ) ;
187+ if ( previousUrl ) {
188+ recurseStandard ( num , previousUrl , latestId , cb ) ;
189+ return ;
190+ }
191+ }
123192 cb ( ) ;
124193 } ) ;
125- } , ( err ) => {
126- if ( err ) {
127- console . log ( 'there was an error' ) ;
128- console . error ( err ) ;
129- return ;
130- }
131- const output = { } ;
132- for ( const key of Object . keys ( current ) . sort ( ) ) {
133- output [ key ] = current [ key ] ;
134- }
135- helper . writeBiblio ( FILENAME , output ) ;
136- } ) ;
194+ }
137195
/**
 * Lazily yields consecutive integers from `from` up to and including `until`.
 * Yields nothing when `from` is greater than `until`.
 *
 * @param {number} from - First value to yield.
 * @param {number} until - Last value to yield (inclusive).
 * @yields {number}
 */
function* range(from, until) {
    let current = from;
    while (current <= until) {
        yield current;
        current++;
    }
}
142200
143201function trimText ( str ) {
144- return str . replace ( / ® / g, '' ) . trim ( ) . replace ( / \s + / g, ' ' ) ;
202+ if ( ! str )
203+ return str ;
204+ str = str . replace ( / ® / g, '' ) . trim ( ) ;
205+
206+ // Replace consecutive newlines (with any surrounding spaces) with a single newline
207+ str = str . replace ( / [ \s - - \n ] * ( \n + [ \s - - \n ] * ) + / gv, '\n' ) ;
208+
209+ // Now replace all other spans of spaces, excluding new lines, with a single space
210+ str = str . replace ( / [ \s - - \n ] + / gv, ' ' ) ;
211+
212+ return str ;
145213}
146214
147215function titleCase ( str ) {
@@ -154,9 +222,9 @@ function gatherText(element) {
154222 if ( node . nodeType === node . ELEMENT_NODE && node . tagName === 'BR' )
155223 str += '\n' ;
156224 else
157- str += trimText ( node . textContent ) + ' ' ;
225+ str += node . textContent ;
158226 }
159- return str ;
227+ return trimText ( str ) ;
160228}
161229
162230function parseTable ( tableEl ) {
@@ -173,7 +241,16 @@ function parseTable(tableEl) {
173241}
174242
/**
 * Cleans up a URL taken from a report's info table.
 *
 * @param {string} str - Raw table cell text.
 * @returns {?string} The whitespace-normalized URL with a leading "http:" upgraded
 *     to "https:", or null when the input is falsy or does not start with "http".
 */
function processURL(str) {
    if (!str) {
        return null;
    }
    const cleaned = trimText(str);
    /*
     * Some cells hold a placeholder instead of a link, e.g. "Previous Version" in
     * https://www.unicode.org/reports/tr38/tr38-5.html and others, where it is "n/a".
     */
    return cleaned.startsWith('http')
        ? cleaned.replace(/^http:/, 'https:')
        : null;
}
178255
179256function parseEditor ( str ) {
@@ -184,3 +261,22 @@ function parseEditor(str) {
184261 }
185262 return arr ;
186263}
264+
/**
 * Extracts the revision number from a versioned report URL.
 *
 * Looks for the pattern "/tr<num>/tr<num>-<revision>" in the URL, where both
 * "tr<num>" parts must be identical. This covers the two layouts in use:
 *  - /tr<num>/tr<num>-<rev>/tr<num>.html (only UTS #35?)
 *  - /tr<num>/tr<num>-<rev>.html (all others)
 *
 * @param {string} url - Versioned report URL.
 * @returns {?string} The revision digits as a string, or null when the URL is
 *     falsy or does not contain the pattern.
 */
function parseRevision(url) {
    if (!url)
        return null;
    // String.prototype.match takes only the pattern; no second argument is needed.
    const match = url.match(/\/(tr\d+)\/\1-(?<rev>\d+)/);
    return match ? match.groups.rev : null;
}
276+
/**
 * Extracts the version string from the "Version" cell of a report's info table.
 *
 * @param {string} str - Raw table cell text.
 * @returns {?string} The whitespace-normalized version with any leading "Unicode"
 *     prefix removed, or null when the input is falsy.
 */
function parseVersion(str) {
    if (!str) {
        return null;
    }
    // Some have "Unicode 11.0.0" instead of the version alone. Strip it.
    const normalized = trimText(str);
    return normalized.replace(/^Unicode\s*/, '');
}
0 commit comments