|
Java example source code file (TextPipeline.java)
This example Java source code file (TextPipeline.java) is included in the alvinalexander.com "Java Source Code Warehouse" project.
The intent of this project is to help you "Learn Java by Example"™.
Learn more about this Java project at its project page.
The TextPipeline.java Java example source code
/*
*
* * Copyright 2015 Skymind,Inc.
* *
* * Licensed under the Apache License, Version 2.0 (the "License");
* * you may not use this file except in compliance with the License.
* * You may obtain a copy of the License at
* *
* * http://www.apache.org/licenses/LICENSE-2.0
* *
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS,
* * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* * See the License for the specific language governing permissions and
* * limitations under the License.
*
*/
package org.deeplearning4j.spark.text.functions;
import org.apache.spark.Accumulator;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
import org.deeplearning4j.berkeley.Counter;
import org.deeplearning4j.berkeley.Pair;
import org.deeplearning4j.models.embeddings.loader.VectorsConfiguration;
import org.deeplearning4j.models.word2vec.Huffman;
import org.deeplearning4j.models.word2vec.VocabWord;
import org.deeplearning4j.models.word2vec.wordstore.VocabCache;
import org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache;
import org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache;
import org.deeplearning4j.spark.text.accumulators.WordFreqAccumulator;
import org.deeplearning4j.text.stopwords.StopWords;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.atomic.AtomicLong;
/**
* A spark based text pipeline
* with minimum word frequency and stop words
*
* @author Adam Gibson
*/
@SuppressWarnings("unchecked")
public class TextPipeline {
//params
private JavaRDD<String> corpusRDD;
private int numWords;
private int nGrams;
private String tokenizer;
private String tokenizerPreprocessor;
private List<String> stopWords = new ArrayList<>();
//Setup
private JavaSparkContext sc;
private Accumulator<Counter wordFreqAcc;
private Broadcast<List stopWordBroadCast;
// Return values
private JavaRDD<Pair> sentenceWordsCountRDD;
private VocabCache<VocabWord> vocabCache = new AbstractCache();
private Broadcast<VocabCache vocabCacheBroadcast;
private JavaRDD<List vocabWordListRDD;
private JavaRDD<AtomicLong> sentenceCountRDD;
private long totalWordCount;
private boolean useUnk;
private VectorsConfiguration configuration;
/**
 * Empty constructor: performs no initialization of its own.
 * Fields keep their declared defaults; those with inline initializers
 * (such as stopWords and vocabCache) hold the values assigned at declaration.
 */
public TextPipeline() {}
// Constructor
public TextPipeline(JavaRDD<String> corpusRDD, Broadcast
Other Java examples (source code examples)
Here is a short list of links related to this Java TextPipeline.java source code file:
|