Current Circumstances:
I have integrated the Stanford NER functionality into a Java service that handles tens of millions of calls per day. It is deployed in 3 clusters, with 10GB of memory allocated per pod.
Issue:
Over a period of about seven days, the pods became unavailable and kept reporting OutOfMemory exceptions.
Calling Code:
private static final Log LOG = LogFactory.getInstance(StanfordNlpApp.class);

private static StanfordCoreNLP pipeline;

// executed once on system start
public static void initNLP() {
    pipeline = new StanfordCoreNLP("stanford-hanweb-chinese.properties");
}

public static List<NerEntityBean> nerByText(String text) {
    List<NerEntityBean> nerEntityBeans = new ArrayList<>();
    long startTime = System.currentTimeMillis();
    Annotation document = new Annotation(text);
    pipeline.annotate(document);
    List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
            String ner = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
            int pos = token.beginPosition();
            if (StringUtil.equals(ner, "O")) {
                continue;
            }
            String word = token.get(CoreAnnotations.TextAnnotation.class);
            NerEntityBean nerEntityBean = new NerEntityBean();
            nerEntityBean.setPos(pos);
            nerEntityBean.setWord(word);
            nerEntityBean.setType(ner);
            nerEntityBeans.add(nerEntityBean);
        }
    }
    long endTime = System.currentTimeMillis();
    long costTime = endTime - startTime;
    LOG.debug("StanfordNlp cost : " + costTime);
    return nerEntityBeans;
}
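For reference, a minimal caller would look like the sketch below (the sample sentence and the demo class are illustrative assumptions, and the NerEntityBean getters are assumed to mirror the setters used in nerByText):

import java.util.List;

public class NerDemo {
    public static void main(String[] args) {
        // Load the pipeline once, exactly as the service does at startup.
        StanfordNlpApp.initNLP();
        // Illustrative input: "Stanford University is located in California."
        List<NerEntityBean> entities = StanfordNlpApp.nerByText("斯坦福大学位于加利福尼亚州。");
        for (NerEntityBean entity : entities) {
            // Assumed bean getters matching the setters shown above.
            System.out.println(entity.getWord() + "\t" + entity.getType() + "\t" + entity.getPos());
        }
    }
}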
Configuration File:
# Pipeline options - lemma is no-op for Chinese but currently needed because coref demands it (bad old requirements system)
annotators = tokenize, ssplit, pos, lemma, ner, parse, coref
# segment
tokenize.language = zh
segment.model = edu/stanford/nlp/models/segmenter/chinese/ctb.gz
segment.sighanCorporaDict = edu/stanford/nlp/models/segmenter/chinese
segment.serDictionary = edu/stanford/nlp/models/segmenter/chinese/dict-chris6.ser.gz,data/model/name-dict.ser.gz
segment.sighanPostProcessing = true
# sentence split
ssplit.boundaryTokenRegex = [.。]|[!?！？]+
# pos
pos.model = edu/stanford/nlp/models/pos-tagger/chinese-distsim.tagger
# ner
ner.language = chinese
ner.model = edu/stanford/nlp/models/ner/chinese.misc.distsim.crf.ser.gz
ner.applyNumericClassifiers = true
ner.useSUTime = false
# regexner
ner.fine.regexner.mapping = edu/stanford/nlp/models/kbp/chinese/gazetteers/cn_regexner_mapping.tab
ner.fine.regexner.noDefaultOverwriteLabels = CITY,COUNTRY,STATE_OR_PROVINCE
# parse
parse.model = edu/stanford/nlp/models/srparser/chineseSR.ser.gz
# depparse
depparse.model = edu/stanford/nlp/models/parser/nndep/UD_Chinese.gz
depparse.language = chinese
# coref
coref.sieves = ChineseHeadMatch, ExactStringMatch, PreciseConstructs, StrictHeadMatch1, StrictHeadMatch2, StrictHeadMatch3, StrictHeadMatch4, PronounMatch
coref.input.type = raw
coref.postprocessing = true
coref.calculateFeatureImportance = false
coref.useConstituencyTree = true
coref.useSemantics = false
coref.algorithm = hybrid
coref.path.word2vec =
coref.language = zh
coref.defaultPronounAgreement = true
coref.zh.dict = edu/stanford/nlp/models/dcoref/zh-attributes.txt.gz
coref.print.md.log = false
coref.md.type = RULE
coref.md.liberalChineseMD = false
# kbp
kbp.semgrex = edu/stanford/nlp/models/kbp/chinese/semgrex
kbp.tokensregex = edu/stanford/nlp/models/kbp/chinese/tokensregex
kbp.language = zh
kbp.model = none
# entitylink
entitylink.wikidict = edu/stanford/nlp/models/kbp/chinese/wikidict_chinese.tsv.gz
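For completeness, the same configuration can also be loaded into a java.util.Properties object and passed to the StanfordCoreNLP(Properties) constructor, which makes it easy to override individual settings for experiments. The sketch below is a minimal example (the file-system path and the reduced annotator list are assumptions used only to compare footprints, not my production setup):

import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Properties;

import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class PipelineFromProperties {
    public static void main(String[] args) throws IOException {
        Properties props = new Properties();
        // Assumption: the properties file sits in the working directory;
        // in the service it is resolved from the classpath instead.
        try (Reader reader = Files.newBufferedReader(
                Paths.get("stanford-hanweb-chinese.properties"), StandardCharsets.UTF_8)) {
            props.load(reader);
        }
        // Experiment only: keep just the annotators that nerByText actually consumes,
        // to see how much of the footprint comes from parse/coref.
        props.setProperty("annotators", "tokenize, ssplit, pos, ner");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        System.out.println("Pipeline constructed with annotators: "
                + props.getProperty("annotators"));
    }
}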
The calling method should be fine. In theory, the memory used per call should be reclaimed by the GC, and total consumption should stay below 10GB. Could you help solve this out-of-memory problem? Thanks a lot.
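To make that GC assumption checkable, a small harness like the sketch below (the sample text, iteration count, and class name are illustrative assumptions) can log used heap across repeated calls to nerByText and show whether memory is actually reclaimed between annotations:

import java.util.List;

public class HeapGrowthCheck {
    public static void main(String[] args) {
        StanfordNlpApp.initNLP();
        Runtime runtime = Runtime.getRuntime();
        // Illustrative input: "Barack Obama was born in Hawaii."
        String sample = "巴拉克·奥巴马出生于夏威夷。";
        for (int i = 1; i <= 10_000; i++) {
            List<NerEntityBean> entities = StanfordNlpApp.nerByText(sample);
            if (i % 1_000 == 0) {
                System.gc(); // request a collection so retained memory stands out
                long usedMb = (runtime.totalMemory() - runtime.freeMemory()) / (1024 * 1024);
                System.out.println("calls=" + i + ", entities=" + entities.size()
                        + ", usedHeap=" + usedMb + " MB");
            }
        }
    }
}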