Home Reference Source

lib/warcRecord/builder.js

  1. 'use strict'
  2. const WARCRecord = require('./record')
  3. const { crlf, begin } = require('./fieldIdentifiers')
  4.  
  5. /**
  6. * @type {{header: symbol, content1: symbol, content2: symbol, consumeCRLFHeader: symbol, consumeCRLFContent1: symbol, consumeCRLFContent2: symbol}}
  7. */
  8. const parsingStates = {
  9. header: Symbol('warc-parsing-header'),
  10. content1: Symbol('warc-parsing-content1'),
  11. content2: Symbol('warc-parsing-content2'),
  12. consumeCRLFHeader: Symbol('warc-parsing-comsume-crlf-header'),
  13. consumeCRLFContent1: Symbol('warc-parsing-comsume-crlf-c1'),
  14. consumeCRLFContent2: Symbol('warc-parsing-comsume-crlf-c2')
  15. }
  16.  
  17. /**
  18. * @type {number}
  19. */
  20. const WFIBeginLen = begin.length
  21.  
  22. /**
  23. * @param {Buffer} line
  24. * @returns {boolean}
  25. */
  26. function isJustCRLF (line) {
  27. if (line.length !== 2) return false
  28. return line[0] === crlf[0] && line[1] === crlf[1]
  29. }
  30.  
  31. /**
  32. * @param {Buffer} line
  33. * @returns {boolean}
  34. */
  35. function isWARCRevisionLine (line) {
  36. if (line.length > 11) return false
  37. let i = 0
  38. while (i < WFIBeginLen) {
  39. if (begin[i] !== line[i]) return false
  40. i += 1
  41. }
  42. return true
  43. }
  44.  
  45. /**
  46. * @desc Progressively builds warc records by consuming the file line by line
  47. */
  48. class RecordBuilder {
  49. /**
  50. * @desc Create a new RecordBuilder
  51. */
  52. constructor () {
  53. /**
  54. * @type {{header: Buffer[], c1: Buffer[], c2: Buffer[]}}
  55. * @private
  56. */
  57. this._parts = {
  58. header: [],
  59. c1: [],
  60. c2: []
  61. }
  62.  
  63. /**
  64. * @type {symbol}
  65. * @private
  66. */
  67. this._parsingState = parsingStates.header
  68. }
  69.  
  70. /**
  71. * @desc Returns a new WARC record if one can be created otherwise returns null
  72. * @returns {?WARCRecord}
  73. */
  74. buildRecord () {
  75. if (this._parts.header.length === 0) return null
  76. const newRecord = new WARCRecord(this._parts)
  77. this._parts.header = []
  78. this._parts.c1 = []
  79. this._parts.c2 = []
  80. return newRecord
  81. }
  82.  
  83. /**
  84. * @desc Consumes a line of a WARC file.
  85. * If a record can be built this function returns a new WARCRecord otherwise null
  86. * @param {Buffer} line - The line to be consumed
  87. * @returns {?WARCRecord}
  88. */
  89. consumeLine (line) {
  90. let newRecord = null
  91. if (isWARCRevisionLine(line)) {
  92. this._parsingState = parsingStates.header
  93. newRecord = this.buildRecord()
  94. }
  95. const isSep = isJustCRLF(line)
  96. switch (this._parsingState) {
  97. case parsingStates.header:
  98. if (!isSep) {
  99. this._parts.header.push(line)
  100. } else {
  101. this._parsingState = parsingStates.consumeCRLFHeader
  102. }
  103. break
  104. case parsingStates.consumeCRLFHeader:
  105. if (!isSep) {
  106. this._parts.c1.push(line)
  107. this._parsingState = parsingStates.content1
  108. }
  109. break
  110. case parsingStates.content1:
  111. if (!isSep) {
  112. this._parts.c1.push(line)
  113. } else {
  114. this._parsingState = parsingStates.consumeCRLFContent1
  115. }
  116. break
  117. case parsingStates.consumeCRLFContent1:
  118. if (!isSep) {
  119. this._parts.c2.push(line)
  120. this._parsingState = parsingStates.content2
  121. }
  122. break
  123. case parsingStates.content2:
  124. if (!isSep) {
  125. this._parts.c2.push(line)
  126. } else {
  127. this._parsingState = parsingStates.consumeCRLFContent2
  128. }
  129. break
  130. case parsingStates.consumeCRLFContent2:
  131. break
  132. }
  133. return newRecord
  134. }
  135. }
  136.  
  137. /**
  138. * @type {RecordBuilder}
  139. */
  140. module.exports = RecordBuilder