{"id":5409,"date":"2017-05-15T16:38:49","date_gmt":"2017-05-15T07:38:49","guid":{"rendered":"https:\/\/gunmagisgeek.com\/wordpress\/?p=5409"},"modified":"2017-09-25T16:43:09","modified_gmt":"2017-09-25T07:43:09","slug":"%e5%8a%b4%e5%83%8d%e9%96%a2%e4%bf%82%e6%b3%95%e9%81%95%e5%8f%8d%e3%83%aa%e3%82%b9%e3%83%88%e3%82%92csv%e3%81%ab%e3%81%97%e3%81%a6%e3%81%bf%e3%81%9f%e3%80%82","status":"publish","type":"post","link":"https:\/\/gunmagisgeek.com\/blog\/node-js\/5409","title":{"rendered":"\u52b4\u50cd\u95a2\u4fc2\u6cd5\u9055\u53cd\u30ea\u30b9\u30c8\u3092csv\u306b\u3057\u3066\u307f\u305f\u3002"},"content":{"rendered":"<p>\u539a\u751f\u52b4\u50cd\u7701\u304c\u516c\u958b\u3057\u305f\u300c<a href=\"http:\/\/www.mhlw.go.jp\/kinkyu\/151106.html\">\u52b4\u50cd\u57fa\u6e96\u95a2\u4fc2\u6cd5\u4ee4\u9055\u53cd\u306b\u4fc2\u308b\u516c\u8868\u4e8b\u6848<\/a>\u300d\u306ePDF\u304b\u3089\u30c6\u30fc\u30d6\u30eb\u30c7\u30fc\u30bf\u3092\u629c\u304d\u51fa\u3057\u3066csv\u5316\u3057\u3066\u307f\u307e\u3057\u305f\u3002<\/p>\n<p>update: 2017\/9\/15<br \/>\n<a href=\"https:\/\/github.com\/shimizu\/black_-company_list\/blob\/master\/csv\/170510-01.csv\">170510-01.csv<\/a><\/p>\n<p><a href=\"https:\/\/github.com\/ashima\/pdf-table-extract\">pdf-table-extract<\/a>\u3092\u4f7f\u3063\u3066\u30b5\u30af\u30c3\u3068\u3067\u304d\u305d\u3046\uff01 \u3068\u304b\u601d\u3063\u3066\u3044\u305f\u306e\u3067\u3059\u304c\u3001\u30bb\u30eb\u5185\u306b\u6539\u884c\u3084\u5168\u89d2\u30fb\u534a\u89d2\u7a7a\u767d\u304c\u6df7\u3058\u3063\u3066\u3044\u308b\u3068\u3046\u307e\u304f\u30d1\u30fc\u30b9\u3067\u304d\u306a\u3044\u3089\u3057\u3044\u3067\u3059\u3002<\/p>\n<p>\u4ed5\u65b9\u304c\u306a\u3044\u306e\u3067\u3001\u4e00\u90e8\u5f37\u5f15\u306a\u65b9\u6cd5\u3067\u6574\u5f62\u3057\u307e\u3057\u305f\u3002<\/p>\n<p>[\u8ffd\u8a18]<br 
\/>\n\u516c\u958b\u3055\u308c\u3066\u3044\u308bpdf\u3001\u3061\u3083\u3093\u3068\u6bce\u6708\u66f4\u65b0\u3055\u308c\u3066\u3044\u308b\u3093\u3060\u3051\u3069\u3001\u30d5\u30a1\u30a4\u30eb\u540d\u304c170510-01.pdf\u306e\u307e\u307e\u8ffd\u8a18\u3055\u308c\u3066\u3044\u304f\u5f62\u5f0f\u306a\u306e\u3067\u308f\u304b\u308a\u3065\u3089\u3044\u3002<\/p>\n<h2>\u30b5\u30f3\u30d7\u30eb\u30b3\u30fc\u30c9<\/h2>\n<pre class=\"lang:js decode:true \" title=\"convert.js\" >const pdf_table_extractor = require(\"pdf-table-extractor\")\r\nconst d3 = require(\"d3-dsv\") \r\n \r\nconst fileName = process.argv[2] || null\r\n\r\nconst shaping = (json) =&gt; {\r\n    const head = json.pageTables[0].tables[0]\r\n    \r\n    const pageTables = json.pageTables.map( page =&gt; page.tables.filter((tr,i) =&gt; i &gt; 0) )\r\n    \r\n    const flattenTable = Array.prototype.concat.apply([], pageTables)\r\n    \r\n    const deletedLF = flattenTable.map( tr =&gt; tr.map(td =&gt; td.replace(\/\\n\/g, \"\")) )\r\n    \r\n\r\n    const cleand = deletedLF.map((tr =&gt; {\r\n        if (tr[1]===\"\" &amp;&amp; tr[2]===\"\" &amp;&amp; tr[3] ==\"\"){\r\n            let split = tr[0].split(\/\\s\/).filter(s =&gt; s !== \"\" )\r\n            \r\n            if(split.length &gt; 4){\r\n                split[0] += split[1]\r\n                delete split[1]\r\n                let tmp = split.filter(s =&gt; s !== null)\r\n                split = tmp\r\n            }\r\n            tr[0] = split[0]\r\n            tr[1] = split[1]\r\n            tr[2] = split[2]\r\n            tr[3] = split[3]\r\n            \r\n        }\r\n        else if (tr[1]===\"\" &amp;&amp; tr[2]===\"\"){\r\n            let split = tr[0].split(\/\\s\/).filter(s =&gt; s !== \"\" )\r\n            if(split.length &gt; 3){\r\n                split[0] += split[1]\r\n                delete split[1]\r\n                let tmp = split.filter(s =&gt; s !== null)\r\n                split = tmp\r\n            }\r\n            tr[0] = 
split[0]\r\n            tr[1] = split[1]\r\n            tr[2] = split[2]\r\n            \r\n        }\r\n        else if (tr[2]===\"\" &amp;&amp; tr[3]===\"\"){\r\n            let split = tr[1].split(\/\\s\/).filter(s =&gt; s !== \"\" )\r\n            if(split.length &gt; 3){\r\n                split[0] += split[1]\r\n                delete split[1]\r\n                let tmp = split.filter(s =&gt; s !== null)\r\n                split = tmp\r\n            }\r\n            tr[1] = split[0]\r\n            tr[2] = split[1]\r\n            tr[3] = split[2]\r\n            \r\n            \r\n        }\r\n        else if (tr[2]==\"\"){\r\n            let split = tr[1].split(\/\\s\/).filter(s =&gt; s !== \"\" )\r\n            tr[1] = split[0]\r\n            tr[2] = split[1]        \r\n        }\r\n        else  if (tr[3] ==\"\"){\r\n            let split = tr[2].split(\/\\s\/).filter(s =&gt; s !== \"\" )\r\n            tr[2] = split[0]\r\n            tr[3] = split[1]        \r\n        }\r\n        \r\n        return tr\r\n    }))\r\n    \r\n    const result = cleand.map(d =&gt; {\r\n        let obj = {}\r\n        head.forEach((key,i) =&gt; {\r\n            obj[key] = d[i]\r\n        })\r\n        return obj\r\n    })\r\n        \r\n    return result\r\n}\r\n\r\n\r\nconst success = (result) =&gt; {\r\n    const json = shaping(result)    \r\n    const csv = d3.csvFormat(json)\r\n    \r\n    console.log('\\ufeff'+csv)\r\n}\r\n \r\nconst error = (err) =&gt; {\r\n   console.error('Error: ' + err)\r\n}\r\n\r\n\r\nif(!fileName){\r\n    console.log(\"\u30d5\u30a1\u30a4\u30eb\u540d\u3092\u6307\u5b9a\u3057\u3066\u304f\u3060\u3055\u3044\")\r\n}else{\r\n    pdf_table_extractor(fileName,success,error)    \r\n}\r\n\r\n<\/pre>\n<p>\u5b9f\u884c <\/p>\n<pre class=\"lang:sh decode:true \" >$ node convert.js 170510-01.pdf &gt; 
170510-01.csv<\/pre>\n","protected":false},"excerpt":{"rendered":"<p>\u539a\u751f\u52b4\u50cd\u7701\u304c\u516c\u958b\u3057\u305f\u300c\u52b4\u50cd\u57fa\u6e96\u95a2\u4fc2\u6cd5\u4ee4\u9055\u53cd\u306b\u4fc2\u308b\u516c&hellip;<\/p>\n","protected":false},"author":1,"featured_media":5410,"comment_status":"closed","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[5],"tags":[],"class_list":["post-5409","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-node-js","has-post-thumbnail-archive"],"_links":{"self":[{"href":"https:\/\/gunmagisgeek.com\/blog\/wp-json\/wp\/v2\/posts\/5409","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/gunmagisgeek.com\/blog\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/gunmagisgeek.com\/blog\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/gunmagisgeek.com\/blog\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/gunmagisgeek.com\/blog\/wp-json\/wp\/v2\/comments?post=5409"}],"version-history":[{"count":0,"href":"https:\/\/gunmagisgeek.com\/blog\/wp-json\/wp\/v2\/posts\/5409\/revisions"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/gunmagisgeek.com\/blog\/wp-json\/wp\/v2\/media\/5410"}],"wp:attachment":[{"href":"https:\/\/gunmagisgeek.com\/blog\/wp-json\/wp\/v2\/media?parent=5409"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/gunmagisgeek.com\/blog\/wp-json\/wp\/v2\/categories?post=5409"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/gunmagisgeek.com\/blog\/wp-json\/wp\/v2\/tags?post=5409"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}