1
+ var TfIdf = require ( "natural" ) . TfIdf ,
2
+ Segment = require ( "segment" ) . Segment ,
3
+ debug = require ( "debug" ) ( "keyword" ) ,
4
+ _ = require ( "lodash" ) ;
5
+
6
/**
 * Count the non-overlapping occurrences of `word` in `doc`.
 *
 * `doc` may be a string (substring search) or an array (element search);
 * both provide `indexOf(value, fromIndex)`.
 *
 * @param {string|Array} doc - Text or list to search; a falsy value yields 0.
 * @param {*} word - Substring (for a string `doc`) or element (for an array
 *   `doc`) to look for. An empty substring yields 0.
 * @returns {number} Number of matches found.
 */
var count = function (doc, word) {
  if (!doc) {
    return 0;
  }
  // After a match, skip the whole match in a string but only one slot in an array.
  var step = typeof doc === "string" ? String(word).length : 1;
  if (step === 0) {
    // An empty needle matches at every position and would never advance.
    return 0;
  }
  var total = 0;
  var found = doc.indexOf(word, 0);
  while (found !== -1) {
    total++;
    // Resume right after the match itself, not a fixed stride from the old offset.
    found = doc.indexOf(word, found + step);
  }
  return total;
};
21
+
22
+ exports . generate = function ( document ) {
23
+ var segment = new Segment ( ) ,
24
+ tfidf = new TfIdf ( ) ,
25
+ words , freqs = { } ,
26
+ rank = { } ;
27
+ segment . useDefault ( ) ;
28
+ words = segment . doSegment ( document ) . map ( function ( t ) {
29
+ return t . w ;
30
+ } ) ;
31
+ debug ( "segmented words %s" , words ) ;
32
+ tfidf . addDocument ( words ) ;
33
+ // words.forEach(function(item) {
34
+ // freqs[item] = count(words, item);
35
+ // if(!rank[item]) {
36
+ // rank[item] = tfidf.tfidf(item, 0);
37
+ // }
38
+ // debug(tfidf.listTerms(0).map(function(t) {return t.term;}));
39
+ // debug("%s, df %s, idf %s", item, freqs[item], rank[item]);
40
+ // });
41
+ // return Object.keys(rank).map(function(k) {
42
+ // return {
43
+ // word: k,
44
+ // value: rank[k]
45
+ // };
46
+ // }).sort(function(a, b) {
47
+ // return a.value - b.value;
48
+ // });
49
+ return _ . uniq ( tfidf . listTerms ( 0 ) , true ) ;
50
+ } ;
0 commit comments