Java example source code file (Word2VecTest.java)
The Word2VecTest.java Java example source code/* * * * Copyright 2015 Skymind,Inc. * * * * Licensed under the Apache License, Version 2.0 (the "License"); * * you may not use this file except in compliance with the License. * * You may obtain a copy of the License at * * * * http://www.apache.org/licenses/LICENSE-2.0 * * * * Unless required by applicable law or agreed to in writing, software * * distributed under the License is distributed on an "AS IS" BASIS, * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * * See the License for the specific language governing permissions and * * limitations under the License. * */ package org.deeplearning4j.spark.models.embeddings.word2vec; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.canova.api.util.ClassPathResource; import org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable; import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; import org.deeplearning4j.models.embeddings.reader.impl.BasicModelUtils; import org.deeplearning4j.models.embeddings.reader.impl.FlatModelUtils; import org.deeplearning4j.models.embeddings.wordvectors.WordVectors; import org.deeplearning4j.models.word2vec.VocabWord; import org.deeplearning4j.models.word2vec.wordstore.VocabCache; import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory; import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; import org.junit.Ignore; import org.junit.Test; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.factory.Nd4j; import java.io.File; import java.util.Arrays; import java.util.Collection; import static org.junit.Assert.*; /** * @author jeffreytang */ public class Word2VecTest { @Test public void testConcepts() throws Exception { // These are all default values 
for word2vec SparkConf sparkConf = new SparkConf().setMaster("local[8]").setAppName("sparktest"); // Set SparkContext JavaSparkContext sc = new JavaSparkContext(sparkConf); // Path of data part-00000 String dataPath = new ClassPathResource("/big/raw_sentences.txt").getFile().getAbsolutePath(); // dataPath = "/ext/Temp/part-00000"; // String dataPath = new ClassPathResource("spark_word2vec_test.txt").getFile().getAbsolutePath(); // Read in data JavaRDD<String> corpus = sc.textFile(dataPath); TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); Word2Vec word2Vec = new Word2Vec.Builder() .setNGrams(1) // .setTokenizer("org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory") // .setTokenPreprocessor("org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor") // .setRemoveStop(false) .tokenizerFactory(t) .seed(42L) .negative(10) .useAdaGrad(false) .layerSize(150) .windowSize(5) .learningRate(0.025) .minLearningRate(0.0001) .iterations(1) .batchSize(100) .minWordFrequency(5) .stopWords(Arrays.asList("three")) .useUnknown(true) .build(); word2Vec.train(corpus); //word2Vec.setModelUtils(new FlatModelUtils()); System.out.println("UNK: " + word2Vec.getWordVectorMatrix("UNK")); InMemoryLookupTable<VocabWord> table = (InMemoryLookupTable Other Java examples (source code examples)Here is a short list of links related to this Java Word2VecTest.java source code file: |
This post is sponsored by my books:
#1 New Release!
FP Best Seller
Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.
A percentage of advertising revenue from pages under the /java/jwarehouse URI on this website is paid back to open source projects.