2023年10月20日

Google Cloud SpeechとOpenAIのAPIを使用してChatGPTに音声で質問してみる

今回はChatGPTに音声で質問する簡単なアプリケーションをJava言語で作成してみたいと思います。
テキストから音声への変換にはGoogle CloudのText-to-Speech API、音声からテキストへの変換にはSpeech-to-Text APIを使用し、ChatGPTへの質問にはOpenAIのAPIを使用します。

実行動画

システム構成

Google Cloud Speechの使用設定

Googleアカウントを作成、Google Cloudのプロジェクトのセットアップ。
APIを使用するにはクレジットカード番号登録が必要。
Speech-to-Text APIを有効にする。
Text-to-Speech APIを有効にする。
サービスアカウント(認証情報)を作成し、JSONファイルをダウンロードする。
JSONファイルを適切な場所に設置し、環境変数 GOOGLE_APPLICATION_CREDENTIALS を設定。

OpenAI APIの使用設定

OpenAIアカウントを作成。
APIを使用するにはクレジットカード番号を登録し、最低5ドルを課金しておく必要がある。
APIキーを作成し、キー文字列を取得する。
取得したキー文字列をプログラムでAPIを使用する際に使用する。

実装

構成

言語：Java
プロジェクト：Maven
作成クラス：App.java

pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the voice-to-ChatGPT sample: Java 8, Google Cloud speech clients, Gson, Apache HttpClient, shaded runnable jar. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

  <modelVersion>4.0.0</modelVersion>

  <!-- Project coordinates -->
  <groupId>com.example</groupId>
  <artifactId>app</artifactId>
  <version>1.0-SNAPSHOT</version>

  <!-- Source encoding and compiler level -->
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <!-- The Google Cloud BOM supplies the versions of the google-cloud-* artifacts declared below -->
  <dependencyManagement>
    <dependencies>
      <dependency>
        <groupId>com.google.cloud</groupId>
        <artifactId>libraries-bom</artifactId>
        <version>26.19.0</version>
        <type>pom</type>
        <scope>import</scope>
      </dependency>
    </dependencies>
  </dependencyManagement>

  <dependencies>
    <!-- Google Cloud Speech-to-Text client library -->
    <dependency>
      <groupId>com.google.cloud</groupId>
      <artifactId>google-cloud-speech</artifactId>
    </dependency>

    <!-- Google Cloud Text-to-Speech client library -->
    <dependency>
      <groupId>com.google.cloud</groupId>
      <artifactId>google-cloud-texttospeech</artifactId>
    </dependency>

    <!-- Gson for building and parsing the OpenAI JSON payloads -->
    <dependency>
      <groupId>com.google.code.gson</groupId>
      <artifactId>gson</artifactId>
      <version>2.8.8</version>
    </dependency>

    <!-- Apache HttpComponents for HTTP requests -->
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.13</version>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <!-- Builds a single runnable jar during the package phase -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <version>3.2.4</version>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <filters>
                <!-- Drop signature files from every bundled artifact -->
                <filter>
                  <artifact>*:*</artifact>
                  <excludes>
                    <exclude>META-INF/*.SF</exclude>
                    <exclude>META-INF/*.DSA</exclude>
                    <exclude>META-INF/*.RSA</exclude>
                  </excludes>
                </filter>
              </filters>
              <transformers>
                <!-- Merge META-INF/services entries from all dependencies -->
                <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
                <!-- Set the entry point in the jar manifest -->
                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                  <mainClass>com.example.App</mainClass>
                </transformer>
              </transformers>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>

App.java

package com.example;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.Scanner;
import com.google.cloud.speech.v1.*;
import com.google.protobuf.ByteString;
import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import org.apache.http.HttpHeaders;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.util.*;
import com.google.cloud.texttospeech.v1.*;
import com.google.protobuf.ByteString;

import javax.sound.sampled.*;
import java.io.ByteArrayInputStream;
import java.io.InputStream;


/**
 * Console voice assistant: records a spoken question from the microphone,
 * transcribes it with Google Cloud Speech-to-Text, sends it to the OpenAI
 * chat-completions endpoint, prints the reply and reads it aloud using
 * Google Cloud Text-to-Speech.
 *
 * <p>Each round records until the user presses Enter. Typing {@code exit}
 * before pressing Enter (or closing standard input) ends the program.
 */
public class App {

	private static final String API_URL = "https://api.openai.com/v1/chat/completions"; // OpenAI chat-completions endpoint
	private static final String API_KEY = "sk-XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"; // set your OpenAI API key here (never commit a real key)

	// Microphone capture format. SAMPLE_RATE is also sent to Speech-to-Text,
	// so both sides are guaranteed to agree on it.
	private static final int SAMPLE_RATE = 16000;
	private static final int SAMPLE_SIZE = 16;
	private static final int CHANNELS = 1;
	private static final boolean SIGNED = true;
	private static final boolean BIG_ENDIAN = false;

	// Cooperative stop signal for the recorder thread (replaces Thread.stop()).
	private static volatile boolean recording;

	/**
	 * Runs the record / transcribe / ask / speak loop until the user exits.
	 *
	 * @param args unused
	 * @throws Exception if audio capture, transcription or playback fails
	 */
	public static void main(String[] args) throws Exception {
		// Settings for capturing audio from the microphone
		AudioFormat format = new AudioFormat(SAMPLE_RATE, SAMPLE_SIZE, CHANNELS, SIGNED, BIG_ENDIAN);
		DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);

		if (!AudioSystem.isLineSupported(info)) {
			System.out.println("Line not supported");
			System.exit(0);
		}

		JsonArray messageHistoryArray = new JsonArray();

		// One Scanner for the whole session. It is deliberately not closed,
		// because closing it would also close System.in.
		Scanner scanner = new Scanner(System.in);

		while (true) {
			// Open the microphone for this round
			TargetDataLine microphone = (TargetDataLine) AudioSystem.getLine(info);
			microphone.open(format);
			microphone.start();

			ByteArrayOutputStream outStream = new ByteArrayOutputStream();

			// Background thread that copies microphone data into the stream
			// until the recording flag is cleared.
			recording = true;
			Thread recordingThread = new Thread(() -> {
				byte[] buffer = new byte[4096];
				while (recording) {
					int bytesRead = microphone.read(buffer, 0, buffer.length);
					if (bytesRead > 0) {
						outStream.write(buffer, 0, bytesRead);
					}
				}
			});
			recordingThread.start();

			String line;
			try {
				System.out.println("質問をしてEnterボタンを押下して下さい。");
				// Wait for Enter; treat end of input the same as "exit"
				line = scanner.hasNextLine() ? scanner.nextLine() : "exit";
			} finally {
				// Clear the flag first, then stop/close the line so a blocked
				// read() returns, and wait for the recorder to finish so the
				// buffer is complete before it is read below.
				recording = false;
				microphone.stop();
				microphone.close();
				recordingThread.join();
			}

			if (line.equals("exit")) {
				break;
			}

			// Ask Google Cloud Speech-to-Text to transcribe the recording
			String messageStr = sendToGoogleSpeech(outStream.toByteArray());
			if (messageStr.isEmpty()) {
				// Nothing was recognised; do not send an empty question
				System.out.println("音声を認識できませんでした。");
				continue;
			}

			System.out.println("-----質問-----");
			System.out.println(messageStr);
			System.out.println("");
			System.out.println("質問中…");

			// Add the question to the conversation history so that the model
			// can follow the flow of the conversation.
			JsonObject message = new JsonObject();
			message.addProperty("role", "user");
			message.addProperty("content", messageStr);
			messageHistoryArray.add(message);

			// Send the conversation to ChatGPT and fetch the reply
			String response = requestCompletion(messageHistoryArray);
			if (response == null) {
				// The request failed (details were already printed). Drop the
				// unanswered question so the history stays consistent.
				messageHistoryArray.remove(messageHistoryArray.size() - 1);
				System.out.println("返信を取得できませんでした。");
				continue;
			}

			// Keep the assistant's answer in the history as well; without it
			// follow-up questions lose the context of earlier replies.
			JsonObject reply = new JsonObject();
			reply.addProperty("role", "assistant");
			reply.addProperty("content", response);
			messageHistoryArray.add(reply);

			System.out.println("-----返信-----");
			System.out.println(response);
			System.out.println("");

			// Convert the reply to speech and play it
			playText(response);
		}
	}

	/**
	 * Transcribes raw LINEAR16 audio to Japanese text with the Speech-to-Text client.
	 *
	 * @param audioData audio bytes captured at {@link #SAMPLE_RATE} Hz
	 * @return the concatenated top transcript of every result, or an empty
	 *         string when there is no audio or nothing was recognised
	 * @throws Exception if the client cannot be created or the request fails
	 */
	private static String sendToGoogleSpeech(byte[] audioData) throws Exception {
		if (audioData == null || audioData.length == 0) {
			return "";
		}

		try (SpeechClient speechClient = SpeechClient.create()) {
			RecognitionConfig config = RecognitionConfig.newBuilder()
				.setEncoding(RecognitionConfig.AudioEncoding.LINEAR16)
				.setSampleRateHertz(SAMPLE_RATE)
				.setLanguageCode("ja-JP")
				.build();

			RecognitionAudio audio = RecognitionAudio.newBuilder()
				.setContent(ByteString.copyFrom(audioData))
				.build();

			RecognizeResponse response = speechClient.recognize(config, audio);

			StringBuilder allMessage = new StringBuilder();
			for (SpeechRecognitionResult result : response.getResultsList()) {
				// Guard against a result that carries no alternatives
				if (result.getAlternativesCount() > 0) {
					allMessage.append(result.getAlternativesList().get(0).getTranscript());
				}
			}
			return allMessage.toString();
		}
	}

	/**
	 * Sends the conversation to ChatGPT through the OpenAI API and returns the reply.
	 *
	 * @param messages chat messages (objects with "role" and "content") in conversation order
	 * @return the content of the first choice, or {@code null} if the request
	 *         failed or the response contained no choices
	 */
	public static String requestCompletion(JsonArray messages) {
		try (CloseableHttpClient httpClient = HttpClients.createDefault()) {

			HttpPost postRequest = new HttpPost(API_URL);
			postRequest.setHeader(HttpHeaders.AUTHORIZATION, "Bearer " + API_KEY);
			postRequest.setHeader(HttpHeaders.CONTENT_TYPE, "application/json");

			JsonObject requestBody = new JsonObject();
			requestBody.addProperty("model", "gpt-3.5-turbo");
			requestBody.add("messages", messages);

			StringEntity entity = new StringEntity(requestBody.toString(), "UTF-8");
			postRequest.setEntity(entity);

			try (CloseableHttpResponse response = httpClient.execute(postRequest)) {
				int status = response.getStatusLine().getStatusCode();
				String responseBody = EntityUtils.toString(response.getEntity(), "UTF-8");
				JsonObject jsonResponse = JsonParser.parseString(responseBody).getAsJsonObject();

				// Error responses have no "choices"; report the body instead
				// of failing later with an opaque NullPointerException.
				JsonArray choices = jsonResponse.getAsJsonArray("choices");
				if (choices == null || choices.size() == 0) {
					System.err.println("OpenAI API request failed (HTTP " + status + "): " + responseBody);
					return null;
				}

				return choices.get(0).getAsJsonObject().get("message").getAsJsonObject().get("content").getAsString();
			}

		} catch (Exception e) {
			e.printStackTrace();
			return null;
		}
	}

	/**
	 * Converts text to speech with Google Text-to-Speech and plays it.
	 * Does nothing when {@code text} is {@code null} or empty.
	 *
	 * @param text the text to speak
	 * @throws Exception if synthesis or playback fails
	 */
	public static void playText(String text) throws Exception {
		if (text == null || text.isEmpty()) {
			return;
		}
		ByteString audioData = synthesizeText(text);
		playAudio(audioData.toByteArray());
	}

	/**
	 * Converts text to Japanese speech with the Google Text-to-Speech client.
	 *
	 * @param text the text to synthesize
	 * @return the synthesized audio content, requested in LINEAR16 encoding
	 * @throws Exception if the client cannot be created or the request fails
	 */
	public static ByteString synthesizeText(String text) throws Exception {
		try (TextToSpeechClient textToSpeechClient = TextToSpeechClient.create()) {
			SynthesisInput input = SynthesisInput.newBuilder().setText(text).build();
			VoiceSelectionParams voice = VoiceSelectionParams.newBuilder().setLanguageCode("ja-JP").build();
			AudioConfig audioConfig = AudioConfig.newBuilder().setAudioEncoding(AudioEncoding.LINEAR16).build(); // LINEAR16 for javax.sound.sampled

			return textToSpeechClient.synthesizeSpeech(input, voice, audioConfig).getAudioContent();
		}
	}

	/**
	 * Plays audio data on the default output line and blocks until playback finishes.
	 *
	 * @param audioData audio in a container that {@link AudioSystem} can parse
	 * @throws Exception if the data cannot be parsed or no output line is available
	 */
	public static void playAudio(byte[] audioData) throws Exception {
		// try-with-resources releases the stream and the line even when
		// playback fails part-way through.
		try (AudioInputStream ais = AudioSystem.getAudioInputStream(new ByteArrayInputStream(audioData))) {
			AudioFormat format = ais.getFormat();
			DataLine.Info info = new DataLine.Info(SourceDataLine.class, format);

			try (SourceDataLine audioLine = (SourceDataLine) AudioSystem.getLine(info)) {
				audioLine.open(format);
				audioLine.start();

				byte[] buffer = new byte[4096];
				int bytesRead;
				while ((bytesRead = ais.read(buffer)) != -1) {
					audioLine.write(buffer, 0, bytesRead);
				}

				// Wait for the queued audio to finish before the line is closed
				audioLine.drain();
			}
		}
	}
}

最後に

動画では結構スムーズに実行出来ていますが、ChatGPTからの返信が長くなると結構待たされます。
業務システムで効果的な使用方法があるかは分かりませんが、面白そうだったのでやってみました。
今度は長い返信でも、ストレスないスピード感が出せるか調べてみようと思います。