将项目 clone 下来之后,直接使用下面的脚本进行编译即可:
#
# 编译 Release 版本: .\\build_windows.ps1 -Config Release -KillRunning -CleanOutputs
# 编译 Debug 版本: .\\build_windows.ps1 -Config Debug -KillRunning -CleanOutputs
#
# Script parameters.
param(
# Build configuration; forwarded to CMake as CMAKE_BUILD_TYPE. Only "Release" or "Debug" is accepted.
[ValidateSet("Release", "Debug")]
[string]$Config = "Release",
# Build directory. When left empty the script uses "build_ninja_<config in lower case>".
[string]$BuildDir = "",
# Delete an existing build directory before configuring.
[switch]$Clean,
# When the bin directory already exists: stop running processes named like the
# built tools instead of aborting with an error.
[switch]$KillRunning,
# When the bin directory already exists: request removal of previously built
# tool outputs (.exe/.pdb/.ilk/.lib/.exp) before building.
[switch]$CleanOutputs
)
# Turn cmdlet errors into terminating errors so the script stops at the first failure.
$ErrorActionPreference = "Stop"
# Returns the installation path of the latest Visual Studio / Build Tools
# instance that has the x86/x64 C++ toolset installed.
# Throws when vswhere.exe is missing or no matching installation is found.
function Find-VSInstallPath {
    # vswhere lives under the 32-bit Program Files folder; ask the environment
    # for it instead of assuming the system drive is C:, and only fall back to
    # the conventional location when the variable is not defined.
    $programFilesX86 = ${env:ProgramFiles(x86)}
    if (-not $programFilesX86) {
        $programFilesX86 = "C:\Program Files (x86)"
    }
    $vswhere = Join-Path $programFilesX86 "Microsoft Visual Studio\Installer\vswhere.exe"
    if (-not (Test-Path -LiteralPath $vswhere)) {
        throw "vswhere.exe not found. Please install Visual Studio Build Tools or Visual Studio."
    }

    # Keep only the first output line so that .Trim() below always operates on
    # a single string, even if vswhere prints more than one line.
    $path = & $vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath |
        Select-Object -First 1
    if (-not $path) {
        throw "No Visual Studio installation with C++ tools found."
    }
    return $path.Trim()
}
# Returns the vcpkg root directory.
# Resolution order: the VCPKG_ROOT environment variable (if it points to an
# existing path), then a Scoop-installed vcpkg ("apps\vcpkg\current").
# Throws when neither location exists.
function Resolve-VcpkgRoot {
    if ($env:VCPKG_ROOT -and (Test-Path -LiteralPath $env:VCPKG_ROOT)) {
        return $env:VCPKG_ROOT
    }

    # Candidate Scoop roots: a custom install directory ($env:SCOOP), the
    # default location under the real user profile, and finally the
    # conventional C:\Users\<name> layout that was previously hard-coded.
    $scoopRoots = @()
    if ($env:SCOOP) {
        $scoopRoots += $env:SCOOP
    }
    if ($env:USERPROFILE) {
        $scoopRoots += (Join-Path $env:USERPROFILE "scoop")
    }
    $scoopRoots += "C:\Users\$env:USERNAME\scoop"

    foreach ($scoopRoot in $scoopRoots) {
        $candidate = Join-Path $scoopRoot "apps\vcpkg\current"
        if (Test-Path -LiteralPath $candidate) {
            return $candidate
        }
    }
    throw "VCPKG_ROOT is not set and vcpkg was not found under scoop. Set VCPKG_ROOT to your vcpkg folder."
}
# Returns the Boost root directory, or an empty string when none is found.
# Resolution order: the BOOST_ROOT environment variable (if it points to an
# existing path), then a Scoop-installed Boost ("apps\boost\current").
function Resolve-BoostRoot {
    if ($env:BOOST_ROOT -and (Test-Path -LiteralPath $env:BOOST_ROOT)) {
        return $env:BOOST_ROOT
    }

    # Candidate Scoop roots: a custom install directory ($env:SCOOP), the
    # default location under the real user profile, and finally the
    # conventional C:\Users\<name> layout that was previously hard-coded.
    $scoopRoots = @()
    if ($env:SCOOP) {
        $scoopRoots += $env:SCOOP
    }
    if ($env:USERPROFILE) {
        $scoopRoots += (Join-Path $env:USERPROFILE "scoop")
    }
    $scoopRoots += "C:\Users\$env:USERNAME\scoop"

    foreach ($scoopRoot in $scoopRoots) {
        $candidate = Join-Path $scoopRoot "apps\boost\current"
        if (Test-Path -LiteralPath $candidate) {
            return $candidate
        }
    }
    # Boost is optional for the caller: signal "not found" with an empty string.
    return ""
}
# Runs a command line through cmd.exe after initialising the Visual Studio
# x64 developer environment.
#   VsDevCmd - path to VsDevCmd.bat
#   Command  - command line to execute once the environment is set up
# If the combined command returns a non-zero exit code, the script exits with
# that same code.
function Invoke-VSCommand {
    param([string]$VsDevCmd, [string]$Command)

    # "&&" makes cmd.exe run the command only when VsDevCmd.bat succeeded.
    $commandLine = '"{0}" -arch=x64 && {1}' -f $VsDevCmd, $Command
    & cmd /c $commandLine

    $status = $LASTEXITCODE
    if ($status -ne 0) {
        exit $status
    }
}
# --- Locate the Visual Studio developer command prompt script ---------------
$vsPath = Find-VSInstallPath
$vsDevCmd = Join-Path -Path $vsPath -ChildPath "Common7\\Tools\\VsDevCmd.bat"
if (-not (Test-Path -Path $vsDevCmd)) {
    throw "VsDevCmd.bat not found at $vsDevCmd"
}

# --- Locate the vcpkg executable ---------------------------------------------
$vcpkgRoot = Resolve-VcpkgRoot
$vcpkgExe = Join-Path -Path $vcpkgRoot -ChildPath "vcpkg.exe"
if (-not (Test-Path -Path $vcpkgExe)) {
    throw "vcpkg.exe not found at $vcpkgExe"
}

# --- Pick the build directory (one per configuration unless overridden) -----
if ([string]::IsNullOrEmpty($BuildDir)) {
    $BuildDir = "build_ninja_" + $Config.ToLower()
}

# -Clean wipes any previous build directory.
if ($Clean) {
    if (Test-Path -Path $BuildDir) {
        Remove-Item -Path $BuildDir -Recurse -Force
    }
}

# --- Report the effective settings -------------------------------------------
@(
    "Using VS: $vsPath",
    "Using vcpkg: $vcpkgRoot",
    "Build dir: $BuildDir",
    "Config: $Config"
) | ForEach-Object { Write-Host $_ }
# If a previous build left a bin directory behind, make sure its outputs cannot
# block the upcoming build: detect (and optionally stop) tools that are still
# running from that directory, and optionally delete the old output files.
$binDir = Join-Path $BuildDir "bin"
if (Test-Path -LiteralPath $binDir) {
    $exeNames = @("lmplz", "query", "build_binary", "filter", "count_ngrams", "kenlm_benchmark", "fragment", "phrase_table_vocab")

    # Only consider processes whose executable lives in this bin directory.
    # Names such as "query" or "filter" are generic, so matching on the name
    # alone could force-kill unrelated programs. Processes whose path cannot be
    # read (e.g. owned by another user) are left alone.
    $binPrefix = (Resolve-Path -LiteralPath $binDir).ProviderPath.TrimEnd('\') + '\'
    $running = @(
        Get-Process -Name $exeNames -ErrorAction SilentlyContinue |
            Where-Object { $_.Path -and $_.Path.StartsWith($binPrefix, [System.StringComparison]::OrdinalIgnoreCase) }
    )
    if ($running.Count -gt 0) {
        if ($KillRunning) {
            $running | Stop-Process -Force
        }
        else {
            $names = ($running | Select-Object -ExpandProperty ProcessName | Sort-Object -Unique) -join ", "
            throw "Detected running KenLM executables ($names). Close them or rerun with -KillRunning."
        }
    }

    if ($CleanOutputs) {
        # Build each path with its own Join-Path call. Writing several
        # "Join-Path a b, Join-Path c d" calls separated by commas is parsed as
        # ONE Join-Path invocation with array arguments and fails at runtime.
        $extensions = @("exe", "pdb", "ilk", "lib", "exp")
        foreach ($name in $exeNames) {
            foreach ($ext in $extensions) {
                $path = Join-Path $binDir "$name.$ext"
                if (Test-Path -LiteralPath $path) {
                    try {
                        Remove-Item -Force -LiteralPath $path
                    }
                    catch {
                        throw "Failed to delete $path. Close any running process using it or rerun with -KillRunning."
                    }
                }
            }
        }
    }
}
# Step 1: install zlib, bzip2 and liblzma for the x64-windows triplet via vcpkg.
$vcpkgInstall = '"{0}" --vcpkg-root "{1}" install zlib bzip2 liblzma --triplet x64-windows' -f $vcpkgExe, $vcpkgRoot
Invoke-VSCommand $vsDevCmd $vcpkgInstall

# Step 2: the vcpkg CMake toolchain file must exist before configuring.
$toolchainFile = Join-Path -Path $vcpkgRoot -ChildPath "scripts\\buildsystems\\vcpkg.cmake"
if (-not (Test-Path -Path $toolchainFile)) {
    throw "vcpkg toolchain file not found: $toolchainFile"
}

# Step 3: configure with the Ninja generator, then build.
$cmakeConfigure = 'cmake -S . -B "{0}" -G Ninja -DCMAKE_BUILD_TYPE={1} -DCMAKE_TOOLCHAIN_FILE="{2}"' -f $BuildDir, $Config, $toolchainFile
Invoke-VSCommand $vsDevCmd $cmakeConfigure

$cmakeBuild = 'cmake --build "{0}"' -f $BuildDir
Invoke-VSCommand $vsDevCmd $cmakeBuild

# Step 4: the build is expected to have produced a bin directory.
$builtBin = Join-Path -Path $BuildDir -ChildPath "bin"
if (-not (Test-Path -Path $builtBin)) {
    throw "Expected output bin directory not found: $BuildDir\\bin"
}
# Copy the Boost runtime DLLs that match the build configuration into the bin
# directory. This step is best-effort: a missing Boost installation or a
# missing DLL only produces a warning.
$boostRoot = Resolve-BoostRoot
if ($boostRoot) {
    $boostLib = Join-Path $boostRoot "lib"
    $binDir = Join-Path $BuildDir "bin"
    # Debug builds of Boost carry the "gd" tag in the file name.
    $suffix = if ($Config -eq "Debug") { "*-mt-gd-x64-1_*.dll" } else { "*-mt-x64-1_*.dll" }
    $boostDlls = @("boost_program_options", "boost_thread", "boost_system")
    foreach ($name in $boostDlls) {
        $pattern = "$name-$suffix"
        # Deliberately not called $matches: that is the automatic variable
        # populated by the -match operator and must not be assigned to.
        $dllFiles = @(Get-ChildItem -Path $boostLib -Filter $pattern -ErrorAction SilentlyContinue)
        if ($dllFiles.Count -gt 0) {
            $dllFiles | Copy-Item -Destination $binDir -Force
        }
        else {
            Write-Warning "Boost DLL not found for $name in $boostLib"
        }
    }
}
else {
    Write-Warning "BOOST_ROOT not set and boost not found under scoop; skipping Boost DLL copy."
}
Write-Host "Build complete. Binaries are in $BuildDir\\bin"
使用的话,相关的命令行使用示例如下。
下面这个是构建 3-gram 的,最终将生成一个 model.arpa 文件。
cmd /c ".\\bin\\lmplz.exe -o 3 < C:\\Users\\SonnyCalcr\\EDisk\\CppCodes\\IMECodes\\Metasequoia-n-gram\\data\\output\\handled\\all_cleaned_only_wiki_zh_spaced_v1.txt > model.arpa"
然后再利用 model.arpa 来制作 model.binary。
.\\bin\\build_binary.exe model.arpa model.binary
注意,这里的语料的格式是一行一行的,并且每个字符之间都是用空格隔开的,譬如,
我 爱 北 京
我 爱 中 国
我 爱 上 海
北 京 是 中 国 首 都
如果需要处理的语料比较大,在运行命令的时候,需要保证系统至少有 40GB 的空闲运行内存。